#ifndef _H_TIRAMISU_CORE_
#define _H_TIRAMISU_CORE_

#include <isl/set.h>
#include <isl/map.h>
#include <isl/union_map.h>
#include <isl/union_set.h>
#include <isl/ast_build.h>
#include <isl/schedule.h>
#include <isl/schedule_node.h>
#include <isl/space.h>
#include <isl/constraint.h>

#include <map>
#include <string.h>
#include <stdint.h>
#include <unordered_map>
#include <unordered_set>
#include <sstream>
#include <utility>

#include <Halide.h>
#include <tiramisu/debug.h>
#include <tiramisu/expr.h>
#include <tiramisu/type.h>
#include "cuda_ast.h"

namespace tiramisu
{
// Forward declarations of the core Tiramisu IR classes, so that they can be
// referenced (as pointers/references) before their full definitions appear.
class view;
class input;
class function;
class computation;
class buffer;
class constant;
class generator;
class computation_tester;
// Communication-related computations used for distributed execution.
class send;
class recv;
class send_recv;
class wait;
class sync;
class xfer_prop;

// Forward declarations for the auto-scheduler module.
namespace auto_scheduler
{
class syntax_tree;
class ast_node;
class computation_info;
class evaluate_by_execution;
class dnn_access_matrix;
class simple_generator;
class state_computation;
class ml_model_schedules_generator;

// Unroll the innermost loop level of each computation in \p comps_list
// with the unrolling factor \p unroll_fact.
void unroll_innermost_levels(std::vector<tiramisu::computation*> const& comps_list, int unroll_fact);
}

struct HalideCodegenOutput
{
    std::map<std::string, tiramisu::computation *> computation_list;
    std::map<std::string, tiramisu::constant *> constant_list;
    std::map<std::string, tiramisu::buffer *> output_buffers;

    HalideCodegenOutput(const std::map<std::string, tiramisu::computation *> &computations,
                        const std::map<std::string, tiramisu::constant *> &constants,
                        const std::map<std::string, tiramisu::buffer *> &buffers)
        : computation_list(computations), constant_list(constants), output_buffers(buffers) {}
};

/**
 * Return the Halide argument kind corresponding to the Tiramisu argument
 * type \p type.
 */
Halide::Argument::Kind halide_argtype_from_tiramisu_argtype(tiramisu::argument_t type);

/**
 * Convert a lowered Halide pipeline (statement \p s, its output functions
 * \p outputs and environment \p env) into Tiramisu objects registered in
 * \p func. \p output_buffers_size gives the size of each output buffer.
 * Returns the computations, constants and output buffers that were created.
 */
HalideCodegenOutput halide_pipeline_to_tiramisu_function(
    Halide::Internal::Stmt s,
    const std::vector<Halide::Internal::Function> &outputs,
    const std::map<std::string, Halide::Internal::Function> &env,
    const std::map<std::string, std::vector<int32_t>> &output_buffers_size,
    tiramisu::function *func);

/**
 * Return the computation that was annotated in the ISL AST node \p node
 * (the annotation is attached during ISL AST generation).
 */
computation *get_computation_annotated_in_a_node(isl_ast_node *node);

/**
 * Generate a new name to be used for an unnamed computation.
 */
std::string generate_new_computation_name();

/**
 * Add the free variable \p free_var_name to the space of \p map.
 */
isl_map *isl_map_add_free_var(const std::string &free_var_name, isl_map *map, isl_ctx *ctx);
/**
 * Split \p str around each occurrence of \p delimiter and append the
 * resulting substrings to \p vector.
 */
void split_string(std::string str, std::string delimiter, std::vector<std::string> &vector);
/**
 * Map a loop level to the corresponding dynamic dimension number in the
 * schedule representation.
 */
int loop_level_into_dynamic_dimension(int level);
/**
 * Map a loop level to the corresponding static dimension number in the
 * schedule representation.
 */
int loop_level_into_static_dimension(int level);

/**
 * Attributes that describe a data transfer (communication) operation.
 */
enum xfer_attr {
    SYNC,       // synchronous transfer
    ASYNC,      // asynchronous transfer
    BLOCK,      // blocking transfer
    NONBLOCK,   // non-blocking transfer
    MPI,        // MPI communication backend
    CUDA,       // CUDA communication backend
    CPU2CPU,    // CPU-to-CPU transfer
    CPU2GPU,    // CPU-to-GPU transfer
    GPU2CPU,    // GPU-to-CPU transfer
    GPU2GPU     // GPU-to-GPU transfer
};

/**
 * Holds the communication computations generated for a data transfer.
 * NOTE(review): presumably either the (s, r) pair or the fused sr is
 * populated depending on the transfer kind — confirm in the implementation.
 */
struct xfer {
    tiramisu::send *s;        // send-side computation
    tiramisu::recv *r;        // receive-side computation
    tiramisu::send_recv *sr;  // fused send/receive computation
};

/**
 * Describes one group of loops to collapse:
 * <loop level, loop start, loop end, number of elements to collapse>.
 */
// Modernized: `using` alias instead of `typedef` (the file already relies on
// C++11 features such as std::unordered_map and default function arguments).
using collapse_group = std::tuple<int, tiramisu::expr, tiramisu::expr, tiramisu::expr>;


//*******************************************************

/**
  * Initialize the Tiramisu compiler and set Tiramisu options to default
  * values.
  * \p name is the name of the function to be generated by the Tiramisu compiler.
  * This is the name that the user will call to execute the generated code.
  */
void init(std::string name);

/**
  * \overload
  * Initialize the Tiramisu compiler without specifying a function name.
  */
void init();

/**
  * \brief Generate code.
  *
  * \details
  *
  * This function generates the declared function and computations in an object file.
  * \p obj_filename is the relative path of the object file to be generated.
  * If \p gen_cuda_stmt is set to true, CUDA code is generated instead of code
  * targeting CPU (i.e., instead of generating Halide IR then LLVM IR).
  * NOTE(review): \p gen_python presumably emits Python bindings in addition
  * to the object file — confirm in the implementation.
  */
 void codegen(const std::vector<tiramisu::buffer *> &arguments, const std::string obj_filename, const bool gen_cuda_stmt = false, bool gen_python = false);
/**
  * \overload
  * Same as above, but the code-generation target is selected through
  * \p gen_architecture_flag instead of the boolean CUDA switch.
  */
 void codegen(const std::vector<tiramisu::buffer *> &arguments, const std::string obj_filename, const tiramisu::hardware_architecture_t gen_architecture_flag, bool gen_python = false);

/**
 * Full check of schedule legality for this function using dependency analysis.
 * Must be used after invoking perform_full_dependency_analysis().
 */
bool check_legality_of_function();


/**
 * Perform a full dependency analysis (RAW/WAR/WAW). The result is stored in the function's attributes.
 * Before invoking this method, the user must call tiramisu::prepare_schedules_for_legality_checks() and must define the buffer associated with each computation.
 */
void perform_full_dependency_analysis();


/**
 * Prepare the schedules of the computations for legality checks for this implicit function by:
 * aligning the schedule dimensions and generating the order between them.
 * If \p reset_static_dimesion is set to true then the computations' static dimensions are reset to 0.
 * The execution order and static dimensions would be redefined back from \p sched_graph, i.e. the static dimensions manually specified with the low level interface would be lost.
 * Resetting static dimensions allows making changes to \p sched_graph using after or then; these changes would be correctly evaluated by the legality checks.
 */
void prepare_schedules_for_legality_checks(bool reset_static_dimesion = false);

 /**
     * Checks if the given fused computations could legally have their loop level \p i as parallel using dependence analysis and legality check.
     * It relies fully on the dependence analysis result, so the method \p perform_full_dependency_analysis() must be invoked before.
     * To correctly invoke this method: schedules must be aligned (same out dimension size) and ordered,
     * so invoking \p prepare_schedules_for_legality_checks() method before is mandatory.
  */
  bool loop_parallelization_is_legal(tiramisu::var i, std::vector<tiramisu::computation *> fused_computations);

  /**
  * Checks if the given fused computations could legally have their loop level \p i unrolled.
  */
  bool loop_unrolling_is_legal(tiramisu::var i, std::vector<tiramisu::computation *> fused_computations);

  /**
  * Checks if the given fused computations could legally have their loop level \p i vectorized.
  */
  bool loop_vectorization_is_legal(tiramisu::var i, std::vector<tiramisu::computation *> fused_computations);

//*******************************************************

/**
  * A class to represent functions in Tiramisu. A function in Tiramisu is composed of
  * a set of computations (tiramisu::computation).
  */
class function
{
    // Friend classes.  They can access the private members of the "function" class.
    friend buffer;
    friend computation;
    friend constant;
    friend generator;
    friend tiramisu::wait;
    friend cuda_ast::generator;
    
    friend auto_scheduler::syntax_tree;
    friend auto_scheduler::evaluate_by_execution;
    friend auto_scheduler::dnn_access_matrix;
    friend auto_scheduler::simple_generator;

private:
    /**
      * The name of the function.
      * Function names should not start with _ (an underscore).
      * Names starting with _ are reserved names.
      */
    std::string name;

    /**
     * True if this requires a call to MPI_Comm_rank or something similar.
     */
    bool _needs_rank_call;

    /**
     * Offset the rank by this amount.
     */
    int rank_offset = 0;

    /**
      * Function arguments. These are the buffers or scalars that are
      * passed to the function.
      */
    std::vector<tiramisu::buffer *> function_arguments;

    /**
      * A vector representing the invariants of the function (symbolic
      * constants or variables that are invariant to the function i.e.
      * do not change their value during the execution of the function).
      */
    std::vector<tiramisu::constant> invariants;

    /**
      * An ISL context associate with the function.
      */
    isl_ctx *ctx;

    /**
      * isl_union_map that describes the true dependencies considering buffers access (the read after write dependencies),
      * the relation describes for each read access the last previous write access that writes into the same buffer element
      * all isl_maps in the union_isl_map are represented in the format:
      * last_write -> [read_access -> used_buffer]
      * Isl methods (isl_map_range_factor_domain and isl_map_range_factor_range) could be used to extract a simpler relation.
      * Useful to check correctness with legality checks.
      */

    isl_union_map * dep_read_after_write ;

    /**
      * isl_union_map that describes the false dependencies considering buffers access (the write after write dependencies),
      * the relation describes for each write access the last previous write access that write into the same buffer element
      * all isl_maps in the union_isl_map are represented in the format:
      * last_write -> [write_access -> used_buffer]
      * Isl methods (isl_map_range_factor_domain and isl_map_range_factor_range) could be used to extract a simpler relation.
      * Useful to check correctness with legality checks.
      */

    isl_union_map * dep_write_after_write ; 

    
    /**
      * isl_union_map that describes the false dependencies considering buffers access (the write after read dependencies),
      * the relation describes for each write access all the previous read accesses that used the previous value [a buffer element] before the current write.
      * all isl_maps in the union_isl_map are represented in the format:
      * read_access_with_the_previous_value-> [ write_access -> used_buffer]
      * Isl methods (isl_map_range_factor_domain and isl_map_range_factor_range) could be used to extract a simpler relation.
      * Useful to check correctness with legality checks.
      */

    isl_union_map * dep_write_after_read ;

    /**
     *  A union map that describes the live-in access (e.g., computation[i,j] -> buffer1[i,j]).
     *  Read accesses that have no write before them (i.e., the value is external).
    */
    isl_union_map * live_in_access ;


    /**
    *  A union map that describes the live-out access (e.g., computation[i,j] -> buffer1[i,j]).
    *  Write access that do not have another write after them (i.e., last written value into the buffer).
    */
    isl_union_map * live_out_access ;

    /**
      * An ISL AST representation of the function.
      * The ISL AST is generated by calling gen_isl_ast().
      */
    isl_ast_node *ast;

    /**
      * A vector representing the parallel dimensions around
      * the computations of the function.
      * A parallel dimension is identified using the pair
      * <computation_name, level>, for example the pair
      * <S0, 0> indicates that the loop with level 0
      * (i.e. the outermost loop) around the computation S0
      * should be parallelized.
      */
    std::vector<std::pair<std::string, int>> parallel_dimensions;

    /**
      * A vector representing the vectorized dimensions around
      * the computations of the function.
      * A vector dimension is identified using the tuple
      * <computation_name, level, length>, for example the tuple
      * <S0, 0, 4> indicates that the loop with level 0
      * (i.e. the outermost loop) around the computation S0
      * should be vectorized with a vector length 4.
      */
    std::vector<std::tuple<std::string, int, int>> vector_dimensions;

    /**
      * A vector representing the distributed dimensions around
      * the computations of the function.
      * A distributed dimension is identified using the pair
      * <computation_name, level>, for example the pair
      * <S0, 0> indicates that the loop with level 0
      * (i.e. the outermost loop) around the computation S0
      * should be distributed.
      */
    std::vector<std::pair<std::string, int>> distributed_dimensions;

    /**
      * A vector representing the GPU block dimensions around
      * the computations of the function.
      * GPU dimensions are dimensions that should be mapped
      * to parallel GPU dimensions.
      * GPU dimensions are identified using the tuple
      * <computation_name, level0, level1, level2>, for example the tuple
      * <S0, 0, 1, 2> indicates that the loops with levels 0, 1 and 2
      * (i.e. the three outermost loops) around the computation S0
      * should be mapped to GPU dimensions.
      * Level1 must be the level following level0, i.e.
      * level1 == level0 + 1
      * and level2 must be the level following level1
      */
    std::vector<std::pair<std::string, std::tuple<int, int, int>>> gpu_block_dimensions;

    /**
     * Similar to gpu_block_dimensions but used to store information about GPU thread dimensions,
     * i.e., dimensions that should be mapped to threads (in CUDA terminology).
     */
    std::vector<std::pair<std::string, std::tuple<int, int, int>>> gpu_thread_dimensions;

    /**
      * A vector representing the dimensions that should be unrolled
      * around the computations of the function.
      * Unrolled dimensions are identified using the tuple
      * <computation_name, level0, factor>, for example the tuple
      * <S0, 1, 8> indicates that the loops with level 1
      * around the computation S0 should be unrolled with an unrolling
      * factor of 8.
      */
    std::vector<std::tuple<std::string, int, int>> unroll_dimensions;

    /**
      * Body of the function (a vector of computations).
      * The order of the computations in the vector does not have any
      * effect on the actual order of execution of the computations.
      * The order of execution of computations is specified through the
      * schedule.
      */
    std::vector<computation *> body;

    /**
      * A Halide statement that represents the whole function.
      * This value stored in halide_stmt is generated by the code generator
      * and is empty before calling the code generator.
      */
    Halide::Internal::Stmt halide_stmt;

    /**
      * A map representing the buffers of the function. Some of these
      * buffers are passed to the function as arguments and some are
      * declared and allocated within the function itself.
      */
    std::map<std::string, tiramisu::buffer *> buffers_list;

    /**
     * The context set of the function, i.e. a set representing the
     * constraints over the parameters.
     * The parameters of a function are the function invariants (constants).
     */
    isl_set *context_set;

    /**
     * The names of the iterators.
     */
    std::vector<std::string> iterator_names;

    /**
     * Map from loop iterators to CUDA kernels.
     */
    std::unordered_map<isl_ast_node*, cuda_ast::kernel_ptr> iterator_to_kernel_map;

    std::shared_ptr<cuda_ast::compiler> nvcc_compiler;

    /**
      * Tag the dimension \p dim of the computation \p computation_name to
      * be parallelized.
      * The dimension 0 represents the outermost loop level (it
      * corresponds to the leftmost dimension in the iteration space).
      */
    void add_parallel_dimension(std::string computation_name, int vec_dim);

    /**
      * Tag the dimension \p dim of the computation \p computation_name to
      * be vectorized. \p len is the vector length.
      * The dimension 0 represents the outermost loop level (it
      * corresponds to the leftmost dimension in the iteration space).
      */
    void add_vector_dimension(std::string computation_name, int vec_dim, int len);

    /**
      * Tag the dimension \p dim of the computation \p computation_name to
      * be distributed.
      * The dimension 0 represents the outermost loop level (it
      * corresponds to the leftmost dimension in the iteration space).
      */
    void add_distributed_dimension(std::string computation_name, int dim);

    /**
      * Tag the loop level \p L of the computation
      * \p computation_name to be unrolled.
      * The dimension 0 represents the outermost loop level (it
      * corresponds to the leftmost dimension in the iteration space).
      * \p factor is the unrolling factor.
      */
    void add_unroll_dimension(std::string stmt_name, int L, int factor);
    
    /**
     * \brief Remove parallel, vectorized, distributed, unrolled and GPU tags
     * on every computation.
     */
    void remove_dimension_tags();

    /**
     * Get live in/out computations in the function.
     */
    // @{
    std::vector<tiramisu::computation *> get_live_in_computations();
    std::vector<tiramisu::computation *> get_live_out_computations();
    // @}

    /**
      * Recursive function to perform the DFS step of dump_sched_graph.
      */
    void dump_sched_graph_dfs(tiramisu::computation *,
                              std::unordered_set<tiramisu::computation *> &);

    /**
      * Recursive function to perform the DFS step of is_sched_graph_tree.
      */
    bool is_sched_graph_tree_dfs(tiramisu::computation *,
                                 std::unordered_set<tiramisu::computation *> &);
    /**
     * This function iterates over the iteration domain of the computations
     * of the function and computes the maximum dimension among the dimensions
     * of these iteration domains.
     */
    int get_max_iteration_domains_dim() const;


    /**
     * This function iterates over the schedules of the function (the schedule
     * of each computation in the function) and computes the maximal dimension
     * among the dimensions of the ranges of all the schedules.
     */
    int get_max_schedules_range_dim() const;

    /**
      * This function first computes the identity schedules,
      * then it computes the maximal dimension among the dimensions
      * of the ranges of all the identity schedules.
      */
    int get_max_identity_schedules_range_dim() const;

    /**
      * Get the trimmed time-processor domain of the function.
      * The first dimension of the time-processor domain is used
      * to indicate duplicates of the computation.  Computations that
      * do not have any duplicate have 0 in that dimension, whereas
      * computations that have duplicates (i.e., are recomputed) have
      * a number in that dimension to represent each duplicate.
      * The trimmed time-processor domain is the time-processor domain
      * without the dimension that represents the duplicates. We simply
      * take the time-processor domain and remove the first dimension
      * used to represent the duplicates.
      */
    isl_union_set *get_trimmed_time_processor_domain() const;

    /**
      * This function iterates over the computations of the function.
      * It modifies the identity schedule of each computation in order to
      * make all the identity schedules have the same number of dimensions
      * in their ranges.
      * This is done by adding dimensions equal to 0 to the range of each
      * identity schedule that does not have enough dimensions.
      */
    isl_union_map *get_aligned_identity_schedules() const;

    /**
     * A pass to rename computations.
     * Computation that are defined multiple times need to be renamed, because
     * those computations in general have different expressions and the code
     * generator expects that computations that have the same name always have
     * the same expression and access relation. So we should rename them to avoid
     * any ambiguity for the code generator.
     *
     */
    void rename_computations();

    /**
     * This method solves the problem of finding the best skewing parameters for the 2-dimensional case.
     * It uses dependence analysis to create a skewing parameters (a,b) space for 5 main use cases,
     * each use case with its isl_basic_set. In total this method should return a vector of 5 elements:
     * 
     * First element [0] : set of (a,b) where a>0 and b>0 that weakly solves the dependencies.
     * Second [1]: set of (a,b) where a>0 and b>0 that strongly solves the dependencies.
     * Third [2]: set of (a,b) where a>0 and b<0 that weakly solves the dependencies.
     * Forth [3]: set of (a,b) where a>0 and b<0 that strongly solves the dependencies.
     * Fifth [4]: set of (a,b) where we could have parallelism on outermost loop level.
     * 
     * The inputs are a vector of fused computations that we want to apply skewing onto,
     * and 2 consecutive loop variables (inner & outer).
     * 
     * It should return an integer \p legal_process that describes the operation with 3 states depending on its result:
     *  1  : correct process with correctly involved dependencies.
     *  0  : correct process while no dependencies were solved.
     * -1  : illegal process (impossible to solve dependencies)
     */
    std::vector<isl_basic_set*> compute_legal_skewing(std::vector<tiramisu::computation *> fused_computations, tiramisu::var outer_variable,
                                              tiramisu::var inner_variable, int&  legal_process);



protected:

    /**
      * Add a buffer to the function.
      * The buffers of the function are either:
      * - buffers passed to the function as arguments, or
      * - buffers that are declared and allocated within the function
      * itself.
      * The first element of the pair is the name of the buffer (it is
      * used as a key), the second element of the pair is a pointer
      * to the buffer.
      */
    void add_buffer(std::pair<std::string, tiramisu::buffer *> buf);

    /**
      * Add a computation to the function.  The order in which
      * computations are added to the function is not important.
      * The order of execution is specified using the schedule.
      * This doesn't allow computations with duplicate names.
      */
    void add_computation(computation *cpt);

    /**
      * Tag the dimensions \p dim0, \p dim1 and \p dim2 of the computation
      * \p computation_name to be mapped to GPU blocks.
      * The dimension 0 represents the outermost loop level (it
      * corresponds to the leftmost dimension in the iteration space).
      *
      * If the user does not want to tag \p dim1 or \p dim2, he can leave
      * their values to default value (i.e., -1).  They will not be tagged.
      *
      * For example
      *
      * add_gpu_block_dimensions("S0", 1, 2);
      *
      * Will tag the dimensions 1 and 2 to be transformed to GPU blocks.
      */
    void add_gpu_block_dimensions(std::string stmt_name, int dim0, int dim1 = -1, int dim2 = -1);

    /**
     * Tag the dimensions \p dim0, \p dim1 and \p dim2 of the computation
     * \p computation_name to be mapped to GPU threads.
     * The dimension 0 represents the outermost loop level (it
     * corresponds to the leftmost dimension in the iteration space).
     *
     * If the user does not want to tag \p dim1 or \p dim2, he can leave
     * their values to default value (i.e., -1).  They will not be tagged.
     *
     * For example
     *
     * add_gpu_block_dimensions("S0", 1, -1, -1);
     *
     * Will tag the dimension 1 to be transformed to GPU threads.
     */
    void add_gpu_thread_dimensions(std::string stmt_name, int dim0, int dim1 = -1, int dim2 = -1);

    /**
     * Add an invariant to the function.
     */
    void add_invariant(tiramisu::constant param);

    /**
      * Add an iterator to the function.
      */
    void add_iterator_name(const std::string &it_name);

    /**
      * Keeps track of allocation computations created using
      * allocate_and_map_buffer_automatically() to schedule them during gen_ordering_schedules.
      */
     std::vector<tiramisu::computation *> automatically_allocated;

    /**
      * Compute the graph of dependences between the computations of
      * the function.
      *
      * Example
      *
      * C[0] = 0
      * D[1] = C[0]
      * D[2] = C[0]
      * {C[0] -> D[1]; C[0]->D[2]}
      */
    isl_union_map *compute_dep_graph();

    /**
      * Get the arguments of the function.
      */
    const std::vector<tiramisu::buffer *> &get_arguments() const;

    /**
      * Return a map that represents the buffers of the function.
      * The buffers of the function are buffers that are either passed
      * to the function as arguments or are buffers that are declared
      * and allocated within the function itself.
      * The names of the buffers are used as a key for the map.
      */
    const std::map<std::string, tiramisu::buffer *> &get_buffers() const;

    /**
      * Return a vector of the computations of the function.
      * The order of the computations in the vector does not have any
      * effect on the actual order of execution of the computations.
      * The order of execution of computations is specified through the
      * schedule.
      */
    const std::vector<computation *> &get_computations() const;

    /**
      * Return the computation of the function that has
      * the name \p str.
      */
    std::vector<computation *> get_computation_by_name(std::string str) const;

    /**
      * Return a string representing the name of the GPU block iterator at
      * dimension \p lev0.
      * This function only returns a non-empty string if the
      * computation \p comp is mapped to a GPU block at the dimension \p lev0.
      */
    std::string get_gpu_block_iterator(const std::string &comp, int lev0) const;

    /**
      * Return a string representing the name of the GPU thread iterator at
      * dimension \p lev0.
      * This function only returns a non-empty string if the
      * computation \p comp is mapped to a GPU thread at the dimension \p lev0.
      */
    std::string get_gpu_thread_iterator(const std::string &comp, int lev0) const;

    /**
      * Return the isl_ctx associated with this function.
      * This is an ISL specific object required when calling certain
      * ISL functions.  It does not represent the set of parameters
      * of the function (which should be retrieved by calling
      * get_program_context()).
      */
    isl_ctx *get_isl_ctx() const;

    /**
      * Return the Halide statement that represents the whole
      * function.
      * The Halide statement is generated by the code generator.
      * This function should not be called before calling the code
      * generator.
      */
    Halide::Internal::Stmt get_halide_stmt() const;

    /**
      * Return a vector representing the invariants of the function
      * (symbolic constants or variables that are invariant to the
      * function i.e. do not change their value during the execution
      * of the function).
      *
      * get_invariant_names() returns the names of the invariants.
      */
    //@{
    const std::vector<tiramisu::constant> &get_invariants() const;
    const std::vector<std::string> get_invariant_names() const;
    //@}

    /**
      * Return an ISL AST that represents this function.
      * This function itself does not generate the ISL AST, it just
      * returns it if it already exists.
      * The function gen_isl_ast() should be called before calling
      * this function.
      */
    isl_ast_node *get_isl_ast() const;

    /**
      * Return the union of all the iteration domains
      * of the computations of the function.
      */
    isl_union_set *get_iteration_domain() const;

    /**
      * Get the iterator names of the function.
      */
    const std::vector<std::string> &get_iterator_names() const;


    /**
      * Return a set that represents the parameters of the function
      * (an ISL set that represents the parameters and constraints over
      * the parameters of the functions,  a parameter is an invariant
      * of the function). This set is also known as the context of
      * the program.
      * An example of a context set is the following:
      *          "[N,M]->{: M>0 and N>0}"
      * This context set indicates that the two parameters N and M
      * are strictly positive.
      */
    isl_set *get_program_context() const;

    /**
      * Return the union of all the schedules
      * of the computations of the function.
      */
    isl_union_map *get_schedule() const;

    /**
      * Return the union of all the trimmed schedules of the
      * function.
      * A trimmed schedule is the schedule without the duplication
      * dimension (the schedule dimension used to indicate duplicate
      * computations).
      */
    isl_union_map *get_trimmed_schedule() const;

    /**
      * Return the union of time-processor domains of each
      * computation in the function.
      * In the time-processor representation, the logical time of
      * execution and the processor where the computation will be
      * executed are both specified.
      */
    isl_union_set *get_time_processor_domain() const;

    /**
     * If the computation \p comp is vectorized, return its vector length
     * at the loop level \p lev.
     */
    int get_vector_length(const std::string &comp, int lev) const;

    /**
     * If the computation \p comp is unrolled at the loop level \p lev,
     * return its unrolling factor.
     */
    int get_unrolling_factor(const std::string &comp, int lev) const;

   /**
     * Return true if the usage of high level scheduling comments is valid; i.e. if
     * the scheduling relations formed using before, after, compute_at, etc.. form a tree.
     *
     * More specifically, it verifies that:
     *     - There should be exactly one computation with no computation scheduled before it.
     *     - Each other computation should have exactly one computation scheduled before it.
     */
    bool is_sched_graph_tree();

    /**
     * Modify the schedules of the computations of this function to reflect
     * the order specified using the high level scheduling commands.
     *
     * Commands like .after() and .before() do not directly modify the schedules
     * but rather modify the sched_graph graph.
     */
    void gen_ordering_schedules();

    /**
      * Stores all high level scheduling instructions between computations; i.e. if a user calls
      * for example c2.after(c1, L), sched_graph[&c1] would contain the key &c2, and
      * sched_graph[&c1][&c2] = L.
      * At the end of scheduling, the graph should respect the following rules:
      *     - There should be exactly one computation with no computation scheduled before it.
      *     - Each other computation should have exactly one computation scheduled before it.
      * In other words, the graph should be a valid tree.
      * Does not include allocation computations created using
      * allocate_and_map_buffer_automatically().
      */
    std::unordered_map<tiramisu::computation *,
    std::unordered_map<tiramisu::computation *, int>> sched_graph;

    /**
      * Same as sched_graph, except in reverse order (from after to before).
      */
    std::unordered_map<tiramisu::computation *,
    std::unordered_map<tiramisu::computation *, int>> sched_graph_reversed;

    /**
     * Set the iterator names of the function.
     * This function overrides any previously set iterator names.
     */
    void set_iterator_names(const std::vector<std::string> &it_names);

    /**
      * Return true if the computation \p comp should be mapped to GPU block
      * at the loop levels \p lev0.
      */
    bool should_map_to_gpu_block(const std::string &comp, int lev0) const;

    /**
      * Return true if the computation \p comp should be mapped to GPU thread
      * at the loop levels \p lev0.
      */
    bool should_map_to_gpu_thread(const std::string &comp, int lev0) const;

    /**
      * Return true if the computation \p comp should be parallelized
      * at the loop level \p lev.
      */
    bool should_parallelize(const std::string &comp, int lev) const;

    /**
      * Return true if the computation \p comp should be unrolled
      * at the loop level \p lev.
      */
    bool should_unroll(const std::string &comp, int lev) const;

    /**
      * Return true if the computation \p comp should be vectorized
      * at the loop level \p lev.
      */
    bool should_vectorize(const std::string &comp, int lev) const;

    /**
      * Return true if the computation \p comp should be distributed
      * at the loop level \p lev.
      */
    bool should_distribute(const std::string &comp, int lev) const;

    /**
      * Return true if this function requires a call to the MPI_Comm_rank function.
      */
    bool needs_rank_call() const;

    /**
      * Lift certain computations for distributed execution to function calls.
      */
    void lift_mpi_comp(tiramisu::computation *comp);

    /**
      * Lift certain computations for distributed execution to function calls.
      */
    void lift_dist_comps();

    /**
      * The set of all computations that have no computation scheduled before them.
      * Does not include allocation computations created using
      * allocate_and_map_buffer_automatically().
      */
    std::unordered_set<tiramisu::computation *> starting_computations;

    /**
     * A boolean set to true if low level scheduling was used in the program.
     * If it is used, then high level scheduling commands such as .before()
     * and .after() should not be mixed with it
     * (NOTE(review): the original sentence was left unfinished; confirm intent).
     */
    bool use_low_level_scheduling_commands;

    /**
     * \brief Generates the automatic communication CPU/GPU.
     * \details This function takes two pointers to the first and the last computation
     *  of the function.
     *  C1 is the first computation, i.e. the one that has no predecessor.
     *  C2 is the last computation, i.e. the one that has no successor.
     */
    void Automatic_communication(tiramisu::computation* c1, tiramisu::computation* c2);

    /**
     * \brief Returns a ptr to the first computation in the sched_graph.
     * \details The computation that has no predecessor.
     */
    computation* get_first_cpt();

    /**
     * \brief Returns a ptr to the last computation in the sched_graph.
     * \details The computation that has no successor.
     */
    computation* get_last_cpt();

    /**
     * \brief This map contains the names of the cpu buffers and the pointers of the corresponding gpu buffers.
     * \details It is modified when creating a gpu buffer, please have a look at the buffer constructor.
     */
    std::map<std::string, tiramisu::buffer *> mapping;

    /**
     * \brief Returns the mapping field of a given function.
     * \details The returned map associates the name of each cpu buffer with a
     *  pointer to the corresponding gpu buffer.
     */
    const std::map<std::string, tiramisu::buffer *> get_mapping() const;

     /**
       * \brief Adds a new pair to the mapping field.
       */
    void add_mapping(std::pair<std::string, tiramisu::buffer *> p);
    
    /**
     * \brief Clear any relation (defined by after, then or between) between computations.
     */
    void clear_sched_graph();

public:

    /**
      * Get the name of the function.
      */
    const std::string &get_name() const;


    /**
     * \brief Construct a function called \p name.
     * \details Function names should not start with _ (an underscore).
     * Names starting with _ are reserved names.
     */
    function(std::string name);

    /**
      * \brief Add a set of constraints to the context of the program.
      *
      * \details This command is useful for providing constraints over the constants
      * used within a tiramisu function.
      * The context of a function is represented as an ISL set that represents
      * constraints over the parameters of the function (a parameter of a
      * function is a constant used in that function).
      *
      * For example, if the constants M and N are known to be positive, it is beneficial
      * to provide such an information to the Tiramisu compiler as follows:
      *     f.add_context_constraints("[N,M]->{: M>0 and N>0}");
      * This context set indicates that the two parameters N and M
      * are strictly positive in the function f.
      *
      * \p new_context should have the same space as the context set.
      * This call intersects the set \p new_context
      * (input) with the context of the function.
      */
    void add_context_constraints(const std::string &new_context);

    /**
      * \brief Align the schedules of all the computations of this
      * function.
      *
      * This method applies to the schedule of each computation
      * in the function.  It makes the dimensions of the ranges of
      * all the schedules equal.  This is done by adding dimensions
      * equal to 0 to the range of each schedule.
      * This function is called automatically when gen_isl_ast()
      * or gen_time_processor_domain() are called.
      */
    void align_schedules();
    
    /**
     * \brief Remove, for every computation, every schedule.
     */
    void reset_schedules();

    /**
     * \brief For each computation, allocate a buffer and map the computation
     * to that buffer.
     *
     * \details For each computation in the function:
     *  - Allocate a buffer where the size of the buffer is derived automatically.
     *    Assuming the name of the computation is C, the name of the generated buffer
     *    is _C_buffer.
     *  - Map the computation to the allocated buffer (one-to-one mapping).
     *    For more details about one-to-one mapping, see computation::store_in.
     */
    void allocate_and_map_buffers_automatically();

    /**
      * \brief Compute the bounds of each computation.
      *
      * \details Computing the bounds of each computation means computing
      * the constraints over the iteration domains of each computation in
      * the function.
      *
      * In order to deduce bounds, Tiramisu first identifies the final consumers
      * in the function (i.e., computations that do not have any consumer).
      * Then, it propagates the bounds over the final consumers to their producers.
      * The bounds of each consumer are used to deduce the bounds over its producer.
      *
      * To take benefit of bound inference, the user can declare computations
      * without providing constraints on their iteration domains. For example
      * the user can declare the following computations (the left side is the
      * iteration domain, while the right side is the expression attached to
      * each computation)
      *
      * \code
      * {A[i]        } : 0
      * {B[i]        } : 0
      * {C[i]        } : A[i] + B[i]
      * {D[i]: 0<=i<N} : 2*C[i]
      * \endcode
      *
      *
      * The user needs only to provide constraints over the domains of the
      * last computations (last consumers), and Tiramisu will propagate
      * these constraints to all the chain of computations that produce for
      * those consumers.
      * In the previous example, constraints over the iteration domain were
      * only provided for the last consumer "D[i]" and no constraints were
      * provided for the other computations.  Bound inference would deduce
      * the constraints for the computations A[i], B[i] and C[i].
      *
      * Note that bound inference is not possible if you have multiple definitions
      * of the same computation. For example, if you have multiple definitions
      * of the same computations, in such a case the user should provide
      * constraints of the iteration domain of the computation.  Example:
      *
      * \code
      * {A[i]        } : 0
      * {C[i]: i=0   } : 0
      * {C[i]: 1<=i<N} : C[i-1] + A[i]
      * {D[i]: 0<=i<N} : 2*C[i]
      * \endcode
      *
      * In this case, constraints over the computations defining C[i] should be
      * provided.
      */
    void compute_bounds();

    /**
      * \brief Dump the function on standard output (dump most of the fields of
      * tiramisu::function).
      * \details This is mainly useful for debugging.
      * If \p exhaustive is set to true, all the fields of the function
      * class are printed.
      */
    void dump(bool exhaustive) const;

    /**
     * \brief Dump the graph of dependences between computations.
     * \details The graph of dependences is a union of maps (relations) from producers
     * to consumers.
     */
    void dump_dep_graph();

    /**
      * \brief Dump a Halide stmt that represents the function.
      * \details tiramisu::function::gen_halide_stmt should be called before calling
      * this function.
      */
    void dump_halide_stmt() const;

    /**
      * \brief Dump the iteration domain of the function.
      * \details This is mainly useful for debugging.
      */
    void dump_iteration_domain() const;

    /**
      * \brief Dump the schedules of the computations of the function.
      * \details This function is mainly useful for debugging.
      * See tiramisu::computations::set_low_level_schedule for details about the
      * schedule.
      */
    void dump_schedule() const;

    /**
      * \brief Dumps the graph of scheduling relations set by the higher level scheduling
      * functions (e.g. after, before, compute_at...).
      * \details This is mainly useful for debugging.
      * This function can be called at any point during scheduling.
      */
    void dump_sched_graph();

    /**
      * \brief Dump (on stdout) the time processor domain of the function.
      * \details The time-processor domain should be generated using
      * tiramisu::function::gen_time_processor_domain before calling this
      * function.
      * This is mainly useful for debugging.
      */
    void dump_time_processor_domain() const;

    /**
      * \brief Dump (on stdout) the trimmed time processor domain of the function.
      * \details The time-processor domain should be generated using
      * tiramisu::function::gen_time_processor_domain before calling this function.
      * This is mainly useful for debugging.
      * The difference between the time-processor domain and the trimmed
      * time-processor domain is that the trimmed one does not have the
      * duplicate dimension.  We remove it before printing.
      * The trimmed time-processor domain is the domain used for code
      * generation.
      */
    void dump_trimmed_time_processor_domain() const;

    /**
      * \brief Generate C code and print it on stdout.
      * \details Currently C code generation is very basic and does not
      * support many features compared to the Halide code generator.
      * Use this for debugging only.
      */
    void gen_c_code() const;

    // ADD:FLEXNLP
    /**
      * \brief Generate autocopy statements for FlexNLP.
      * \details This function generates automatic copy computations
      * and adds them to the code to avoid to the user to do manual
      * data loading to the device.
      */
    void gen_flexnlp_autocopy();

    // TODO:FLEXNLP Add documentation
    void gen_halide_bug_workaround_computations();

    /**
      * \brief Generate an object file that contains the compiled function.
      * \details This function relies on Halide to generate the object file.
      *
      * \p obj_file_name indicates the name of the generated file.
      *
      * \p os indicates the target operating system (Halide::Target::OS).
      *
      * \p arch indicates the architecture of the target (the instruction set).
      *
      * \p bits indicate the bit-width of the target machine.
      *    must be 0 for unknown, or 32 or 64.
      * \p hw_architecture indicate the hardware architecture, it has to be one
      *    of arch_flexnlp, arch_cpu, arch_nvidia_gpu.
      * For a full list of supported values for \p os and \p arch please
      * check the documentation of Halide::Target
      * (http://halide-lang.org/docs/struct_halide_1_1_target.html).
      * If the machine parameters are not supplied, Halide detects
      * the parameters of the host machine automatically.

      */
    void gen_halide_obj(const std::string &obj_file_name, Halide::Target::OS os,
                        Halide::Target::Arch arch, int bits,
                        const tiramisu::hardware_architecture_t hw_architecture,
			bool gen_python = false) const;

    /**
      * \overload
      */
    void gen_halide_obj(const std::string &obj_file_name, Halide::Target::OS os,
                        Halide::Target::Arch arch, int bits, bool gen_python = false) const;

    /**
      * \overload
      */
    void gen_halide_obj(const std::string &obj_file_name, bool gen_python = false) const;

    /**
      * \overload
      */
    void gen_halide_obj(const std::string &obj_file_name, const tiramisu::hardware_architecture_t hw_architecture, bool gen_python = false) const;

    /**
      * Generate a Halide stmt that represents the function.
      */
    void gen_halide_stmt();

    void gen_cuda_stmt();

    /**
      * Generate an isl AST that represents the function.
      */
    void gen_isl_ast();

    /**
      * Generate the time-space domain of the function.
      *
      * In this representation, the logical time of execution and the
      * processor where the computation will be executed are both
      * specified.
      */
    void gen_time_space_domain();

    /**
      * Return the invariant of the function that has
      * the name \p str.
      */
    constant* get_invariant_by_name(std::string str) const;

    /**
      * Set the arguments of the function.
      * The arguments of the function are provided as a vector of
      * pointers to buffers. Each buffer represents an argument
      * to the function.
      * During code generation, the arguments in the vector will become
      * the arguments of the generated function (with the order of their
      * appearance in the vector).
      */
    void set_arguments(const std::vector<tiramisu::buffer *> &buffer_vec);

    /**
     * Wrapper for all the functions required to run code generation of a
     * tiramisu program.
     */
    void codegen(const std::vector<tiramisu::buffer *> &arguments, const std::string obj_filename, const bool gen_cuda_stmt = false, bool gen_python = false);
    void codegen(const std::vector<tiramisu::buffer *> &arguments, const std::string obj_filename, const tiramisu::hardware_architecture_t gen_architecture_flag, bool gen_python = false);

    /**
     * \brief Set the context of the function.
     * \details A context is an ISL set that represents constraints over the
     * parameters of the functions (a parameter is an invariant variable for
     * the function).
     * An example of a context set is the following:
     *          "[N,M]->{: M>0 and N>0}"
     * This context set indicates that the two parameters N and M
     * are strictly positive.
     *
     * This function takes a string that represents an ISL set.
     */
    void set_context_set(const std::string &context);

    /**
      * \overload
      *
      * This function takes an ISL set as input.
      */
    void set_context_set(isl_set *context);

    /**
      * Computes flow and performs data analysis for this function and all of its computations.
      * This includes Reads after write, Write after write, Write after read, live_out_access, and live_in_access.
      * At the moment of the call, the computation order and their buffers must be defined, and the schedules must be the default ones with no optimizations applied.
      * So this method should be invoked directly after mapping computations to their buffers.
      * The result is stored in the attributes of the function "live_out_access, live_in_access, deps_read_after_write, deps_write_after_write ...".
      * These attributes help to check the legality of schedules using the \p check_legality_for_function() method in the function class,
      * or \p involved_subset_of_dependencies_is_legal() method in the computation class.
      * This method also computes live_out and live_in access for this function.
      * After the call the user is free to change & optimize the schedules.
      */
    void perform_full_dependency_analysis();
  
    /**
     *  Uses the dependency analysis to check if the current schedules of all computations are legal.
     *  Must be invoked after a correct call to \p perform_full_dependency_analysis().
    */
    bool check_legality_for_function();

    /**
     * Calculate all the dependencies in the function RAW/WAW/WAR & store in the function's attributes
     * All schedules must be ordered (after or then), and with same length using:
     * 1-gen_ordering_schedules
     * 2-align_schedules
    */
    void calculate_dep_flow() ;

    /**
     * Align schedules dimensions and adds the computation's order to them. 
     * This is done to correctly invoke calculate_dep_flow() method that performs dependence analysis
     * It calls gen_ordering_schedules() and align_schedules() function's methods internally.
     * if \p reset_static_dimesion is true then it will also call function.reset_all_static_dims_to_zero() before ordering.
    */
    void prepare_schedules_for_legality_checks(bool reset_static_dimesion = false) ;


    /**
     * Checks if the given fused computations could legally have their loop level \p i as parallel using dependence analysis and legality check.
     * It relies fully on the dependence analysis result, so the  method \p perform_full_dependency_analysis() must be invoked before.
     * To correctly invoke this method : schedules must be aligned (same out dimension size) and ordered,
     * so invoking \p prepare_schedules_for_legality_checks() method before is mandatory. 
    */
    // @{
    bool loop_parallelization_is_legal(tiramisu::var i, std::vector<tiramisu::computation *> fused_computations);

    bool loop_parallelization_is_legal(int parallel_dim, std::vector<tiramisu::computation *> fused_computations);
    // @}

    /**
     * Checks if the given fused computations could legally have their loop level \p i unrolled.
    */
    bool loop_unrolling_is_legal(tiramisu::var i, std::vector<tiramisu::computation *> fused_computations);

    /**
     * Checks if the given fused computations could legally have their loop level \p i vectorized.
    */
    bool loop_vectorization_is_legal(tiramisu::var i, std::vector<tiramisu::computation *> fused_computations);

    /**
     * Resets all the static beta dimensions in all the computations to zero.
     * This allows function.generate_ordering to be executed many times without issues.
     * However, any static beta dimension and ordering not specified using the scheduling graph (after or then) would be lost.
    */
    void reset_all_static_dims_to_zero();

    /**
     * Automatically computes the shifting parameters for variables \p vars_subjected_to_shifting that would allow to legally fuse \p current computation,
     * with the vector of computations \p previous_computations if it is possible.
     * This method returns a vector of tuples mapping each variable with the required shifting if the fusion is possible, and an empty vector otherwise (impossible fusion).
     * Note: In case where the fusion is legal and doesn't require shifting, the vector of tuples would map the variable to 0.
     * The method relies fully on the dependence analysis result, so the method \p perform_full_dependency_analysis() must be invoked before.
     * To correctly invoke this method : schedules must be aligned (same out dimension size) and ordered,
     * so invoking \p prepare_schedules_for_legality_checks() method before is mandatory.
     * The shifting parameters given are always greater than or equal to zero. This is an additional internal condition.
    */
    std::vector<std::tuple<tiramisu::var,int>> correcting_loop_fusion_with_shifting(std::vector<tiramisu::computation*> previous_computations, tiramisu::computation current, std::vector<tiramisu::var> vars_subjected_to_shifting);

    /**
     * Uses the dependency analysis to check if the specified schedules of computations are legal.
     * This method only tests the dependencies between the computations specified in the input and ignore the rest.
     * must be invoked after the correct call to \p perform_full_dependency_analysis()
    */
    bool check_partial_legality_in_function(std::vector<tiramisu::computation * > involved_computations);

    /**
     * \brief Computes the best legal skewing parameters for 3 use cases (outer parallelism, locality and innermost parallelism)
     *  on 2 consecutive loops.
     * \attention the method uses the dependence analysis results, so invoking \p prepare_schedules_for_legality_checks() 
     *  (to align and order schedules with same out dimension size)
     *  then \p perform_full_dependency_analysis() before is mandatory. 
     * 
     * \param fused_computations All computations inside the 2 consecutive loops
     * \param outer_variable the outer of the 2 consecutive loops
     * \param inner_variable the inner of the 2 consecutive loops
     * 
     * \returns a tuple of vectors, each vector represents possible skewings (with 2 parameters),
     * each skewing can be used as an input to Computation.skew().
     * 
     * \note Skewing is useful to enable parallelism in some cases where it was impossible initially.
     * if the \p outer_variable is already parallel with \p loop_parallelization_is_legal  then THIS skewing is not needed.
     * This solver will try to find a skewing, sometimes it will fail and no skewing could be found to enable parallelism.
     * (if the dependencies are empty or not empty but impossible to solve)
     * In case pairs are returned, they enable parallelism on either \p outer_variable or \p inner_variable.
     * 
     * \details First vector contains either one pair of <int,int> that allows parallelism on outer_variable, or an empty vector.
     * Second vector contains a vector of pairs that enables parallelism on inner_variable.
     * Third vector contains a vector of parameters that should in theory improve locality (without any parallelism).
     * 
     * nb_parallel is the number of solutions (pairs) inside the second vector (parallelism on inner_variable),
     * the second vector's size should be equal to twice the value of nb_parallel in the regular case.
     * for nb_parallel=1 it only returns the smallest skewing (best) possible for this use case.
    */
    std::tuple<
      std::vector<std::pair<int,int>>,
      std::vector<std::pair<int,int>>,
      std::vector<std::pair<int,int>>> skewing_local_solver(std::vector<tiramisu::computation *> fused_computations,
                                                            tiramisu::var outer_variable,tiramisu::var inner_variable, int nb_parallel);

    /**
     * Computes the best legal skewing parameters for 3 use cases (outer parallelism, innermost parallelism and identity).
     * This method also makes sure the dependencies become positive with skewing in order to enable tiling.
     * The method relies fully on the dependence analysis result, so the  method \p perform_full_dependency_analysis() must be invoked before.
     * To correctly invoke this method : schedules must be aligned (same out dimension size) and ordered,
     * so invoking \p prepare_schedules_for_legality_checks() method before is mandatory. 
     * The output of this method is a tuple of vectors, each vector represent a usecase,
     * the elements of the vector are the 4 skewing parameters (alpha,beta,gamma,sigma) that should be given 
     * as an input for Computation.skew() method (skewing method that takes 4 parameters).
     * 
     * 
     * First vector contains either one set of parameters that allows parallelism on outer_variable, or an empty vector.
     * Second vector contains a vector of parameters that enables parallelism on inner_variable.
     * Third vector contains a vector of parameters that enables tiling and makes the dependencies positive for identity (alpha=1, beta=0).
     * 
     * nb_parallel is the number of solutions (pairs) inside the second vector (parallelism on inner_variable),
     * the second vector's size should be equal to twice the value of nb_parallel in the regular case.
     * for nb_parallel=1 it only returns the smallest skewing (best) possible for this use case.
     * 
     * In case of a lack of dependencies within the scope of fused_computations, or in case of some dependencies impossible to solve,
     * the output should be 3 empty vectors.
    */
    std::tuple<
      std::vector<std::tuple<int,int,int,int>>,
      std::vector<std::tuple<int,int,int,int>>,
      std::vector<std::tuple<int,int,int,int>>> skewing_local_solver_positive(std::vector<tiramisu::computation *> fused_computations,
                                                          tiramisu::var outer_variable,tiramisu::var inner_variable, int nb_parallel);
                                                          
    /**
     * Computes the best legal 3D skewing parameters for 3 use cases (outer parallelism, innermost parallelism and identity).
     * This method also makes sure the dependencies become positive with skewing in order to enable tiling.
     * The method relies fully on the dependence analysis result, so the  method \p perform_full_dependency_analysis() must be invoked before.
     * To correctly invoke this method : schedules must be aligned (same out dimension size) and ordered,
     * so invoking \p prepare_schedules_for_legality_checks() method before is mandatory. 
     * The output of this method is a tuple of vectors, each vector represent a usecase,
     * the elements of the vector are the 9 skewing parameters (3 for 1st dimension, 3 for second, then 3 for the third)
     * 
     * 
     * First vector contains either one set of parameters that allows parallelism on outer_variable, or an empty vector.
     * Second vector contains a vector of parameters that enables parallelism on inner_variable.
     * Third vector contains a vector of parameters that enables tiling and makes the dependencies positive for identity (alpha=1, beta=0).
     * 
     * Note that for all 3 cases the dependencies become positive in all 3 dimensions, which enables 3D tiling.
     * In case of a lack of dependencies within the scope of fused_computations, or in case of some dependencies impossible to solve,
     * the output should be 3 empty vectors.
    */
    std::tuple<
      std::vector<std::tuple<int,int,int,int,int,int,int,int,int>>,
      std::vector<std::tuple<int,int,int,int,int,int,int,int,int>>,
      std::vector<std::tuple<int,int,int,int,int,int,int,int,int>>> skewing_local_3D_solver_positive(
                                                  std::vector<tiramisu::computation *> fused_computations, tiramisu::var var_outer,
                                                  tiramisu::var var2, tiramisu::var var_inner);
                                                          
    std::vector<int> extract_transformation_coeffcients(isl_basic_map * transformation, int position);

    std::tuple<int,int,int,int,int,int,int,int,int> extract_3d_skewing_params(isl_basic_map * transformation);

};


/**
  * A class that represents buffers.
  *
  * Buffers have two use cases:
  * - used to store the results of computations, and
  * - used to represent input arguments to functions.
  */
class buffer
{
    friend computation;
    friend function;
    friend generator;
    friend cuda_ast::generator;

private:
    /**
     * A boolean that indicates whether a buffer is a dummy buffer or not (used as default value for a buffer).
     */
    bool is_dummy;

    /**
     * A boolean that indicates whether a buffer is allocated or not.
     */
    bool allocated;

    /**
      * Type of the argument (if the buffer is an argument):
      * Three possible types:
      *  - a_input: for inputs of the function,
      *  - a_output: for outputs of the function,
      *  - a_temporary: for buffers used as temporary buffers within
      *  the function (any temporary buffer is allocated automatically by
      *  the Tiramisu runtime at the entry of the function and is
      *  deallocated at the exit of the function).
      */
    tiramisu::argument_t argtype;

    /**
     * A boolean indicating whether the buffer should be allocated
     * automatically by Tiramisu.
     */
    bool auto_allocate;

    /**
     * A boolean indicating whether the buffer should be deallocated
     * automatically by Tiramisu.
     */
    bool auto_deallocate = true;

    /**
     * automatic_gpu_copy = true by default; it is set to false when the user
     * wants to do the data transfer to the GPU manually.
     */
    bool automatic_gpu_copy;

    /**
     * automatic_flexnlp_copy = true by default; it is set to false when the user
     * wants to do the data transfer to FlexNLP manually.
     */
    bool automatic_flexnlp_copy;

    /**
      * The sizes of the dimensions of the buffer.  Assuming the following
      * buffer buf[N0][N1][N2], dim_sizes should be {N0, N1, N2}.
      */
    std::vector<tiramisu::expr> dim_sizes;

    /**
      * The tiramisu function where this buffer is declared or where the
      * buffer is an argument.
      */
    tiramisu::function *fct;

    /**
      * The name of the buffer.
      * Buffer names should not start with _ (an underscore).
      * Names starting with _ are reserved names.
      */
    std::string name;

    /**
      * The type of the elements of the buffer.
      */
    tiramisu::primitive_t type;

    /**
     * The location of the buffer (host if in memory, else where on GPU).
     */
    cuda_ast::memory_location location;

protected:
    /**
     * Set the type of the argument. Three possible types exist:
     *  - a_input: for inputs of the function,
     *  - a_output: for outputs of the function,
     *  - a_temporary: for buffers used as temporary buffers within
     *  the function (any temporary buffer is allocated automatically by
     *  the Tiramisu runtime at the entry of the function and is
     *  deallocated at the exit of the function).
     */
    void set_argument_type(tiramisu::argument_t type);

    /**
      * Return whether the buffer should be allocated automatically.
      */
    bool get_auto_allocate();

    /**
      * Return whether the buffer should be deallocated automatically.
      */
    bool get_auto_deallocate();

    /**
      * Return whether the copy should be done automatically to the gpu device.
      */
    bool get_automatic_gpu_copy();

    /**
      * Return whether the copy should be done automatically to the flexnlp device.
      */
    bool get_automatic_flexnlp_copy();

    /**
     * Set the size of a dimension of the buffer.
     */
    void set_dim_size(int dim, int size);

public:
    /**
      * \brief Default tiramisu constructor
      */
    buffer();

    /**
      * \brief Create a tiramisu buffer.
      *
      * \details
      *
      * A Tiramisu buffer is equivalent to an array in C.
      *
      * Buffers have two use cases:
      * - Used to store the results of computations, and
      * - Used to represent input arguments to functions.
      *
      * \p name is the name of the buffer.
      *
      * \p dim_sizes is a vector of tiramisu expressions that represent the
      * size of each dimension in the buffer.
      * Assuming we want to declare the buffer buf[N0][N1][N2],
      * then the vector of sizes should be {N0, N1, N2}.
      * Buffer dimensions in Tiramisu have the same semantics as in
      * C/C++.
      *
      * \p type is the type of the elements of the buffer.
      * It must be a primitive type (i.e. p_uint8, p_uint16, ...).
      * Possible types are declared in \ref tiramisu::primitive_t
      * (in type.h).
      *
      * \p argt indicates whether this buffer is an input or an output
      *  buffer and whether it should be allocated automatically by Tiramisu.
      *  The possible types (\ref tiramisu::argument_t) are:
      *
      *  - tiramisu::a_input: indicates that the buffer is supposed
      * to be passed as input to the function generated by Tiramisu.
      *
      *  - tiramisu::a_output: indicates that the buffer is supposed
      * to be passed as output to the function generated by Tiramisu.
      * Input and output buffers should be allocated by the user
      * before calling the Tiramisu generated function.  Tiramisu
      * does not allocate memory for input and output buffers.
      *
      *  - tiramisu::a_temporary: this buffer is not supposed to
      * be passed to the function as an argument.  It will be
      * allocated automatically by the Tiramisu compiler at
      * the beginning of the function and deallocated at the end
      * of the function.  The user can also allocate the buffer
      * "manually" by setting the automatic allocation to false for
      * this buffer.
      *
      * \p fct is a pointer to a Tiramisu function where the buffer is
      * declared or used.  If this argument is not provided (which is
      * the common case), the function that was created automatically
      * during Tiramisu initialization will be used (we call that
      * function the "implicit function").
      *
      * \p corr is the name of the cpu buffer corresponding to a gpu buffer.
      * This field is only set, when we create a gpu buffer.
      *
      * Buffer names should not start with _ (an underscore).
      * Names starting with _ are reserved names.
      */
    buffer(std::string name, std::vector<tiramisu::expr> dim_sizes,
           tiramisu::primitive_t type, tiramisu::argument_t argt,
           tiramisu::function *fct = global::get_implicit_function(),
           std::string corr = "");

    /**
      * Return if this buffer is a dummy buffer or not (default buffer used for default buffer parameter values)
      */
    bool get_is_dummy();

    /**
     * \brief Indicate when to allocate the buffer (i.e., the schedule).
     *
     * \details The buffer is allocated in the same loop of the computation \p C
     * at the loop level \p level (but the order between the two is not
     * specified).
     *
     * For example, let's assume that buf0 is a buffer, and let's assume
     * that we have three computations C1, C2 and C3 scheduled as follow
     *
     * \code
     * for (i=0; i<N; i++)
     *      for (j=0; j<N; j++)
     *           for (k=0; k<N; k++)
     *              C1;
     *
     * for (i=0; i<N; i++) // level 0
     *      for (j=0; j<N; j++) // level 1
     *           for (k=0; k<N; k++) // level 2
     *           {
     *              C2;
     *              C3;
     *           }
     * \endcode
     *
     * The following Tiramisu code
     *
     * \code
     * tiramisu::computation *C4 = buf0.allocate_at(C2, j);
     * C4->before(C2, j);
     * \endcode
     *
     * would allocate buf0 in the loop surrounding C2 at the loop
     * level 1 (the j loop). The allocation computation is called C4, where
     * C4 is scheduled to execute before C2 at the loop level 1.
     * The generated code would look like the following code:
     *
     * \code
     * for (i=0; i<N; i++)
     *      for (j=0; j<N; j++)
     *           for (k=0; k<N; k++)
     *              C1;
     *
     * for (i=0; i<N; i++) // level 0
     *      for (j=0; j<N; j++) // level 1
     *      {
     *           allocate(buf0, buffer_size, buffer_type);
     *           for (k=0; k<N; k++) // level 2
     *           {
     *              C2;
     *              C3;
     *           }
     *      }
     * \endcode
     *
     */
    //@{
    tiramisu::computation *allocate_at(tiramisu::computation &C, tiramisu::var level);
    tiramisu::computation *allocate_at(tiramisu::computation &C, int level);
    //@}

    /**
     * \brief Indicate when to deallocate the buffer (i.e., the schedule).
     *
     * \details The buffer is deallocated in the same loop of the computation \p C
     * at the loop level \p level (but the order between the two is not
     * specified).
     *
     * For example, let's assume that buf0 is a buffer, and let's assume
     * that we have three computations C1, C2 and C3 scheduled as follow
     *
     * \code
     * for (i=0; i<N; i++)
     *      for (j=0; j<N; j++)
     *           for (k=0; k<N; k++)
     *              C1;
     *
     * for (i=0; i<N; i++) // level 0
     *      for (j=0; j<N; j++) // level 1
     *           for (k=0; k<N; k++) // level 2
     *           {
     *              C2;
     *              C3;
     *           }
     * \endcode
     *
     * The following Tiramisu code
     *
     * \code
     * tiramisu::computation *C4 = buf0.allocate_at(C2, j);
     * C4->before(C2, j);
     * tiramisu::computation *C5 = buf0.deallocate_at(C3, j);
     * C5->after(C3, j)
     * \endcode
     *
     * would allocate buf0 in the loop surrounding C2 at the loop
     * level 1 (the j loop). The allocation computation is called C4, where
     * C4 is scheduled to execute before C2 at the loop level 1.
     * And would also deallocate buf0 at the end of the loop level 1
     * The generated code would look like the following code:
     *
     * \code
     * for (i=0; i<N; i++)
     *      for (j=0; j<N; j++)
     *           for (k=0; k<N; k++)
     *              C1;
     *
     * for (i=0; i<N; i++) // level 0
     *      for (j=0; j<N; j++) // level 1
     *      {
     *           allocate(buf0, buffer_size, buffer_type);
     *           for (k=0; k<N; k++) // level 2
     *           {
     *              C2;
     *              C3;
     *           }
     *           deallocate(buf0);
     *      }
     * \endcode
     *
     */
    //@{
    tiramisu::computation *deallocate_at(tiramisu::computation &C, tiramisu::var level);
    tiramisu::computation *deallocate_at(tiramisu::computation &C, int level);
    //@}

    /**
      * \brief Dump the buffer on standard output.
      * \details This function dumps most of the fields of the buffer class
      * but not all of them.  It is mainly useful for debugging.
      * If \p exhaustive is set to true, all the fields of the buffer
      * class are printed.
      */
    void dump(bool exhaustive) const;

    /**
      * \brief If this buffer is an argument to a tiramisu::function,
      * return the type of the argument
      *
      * \details Three possible types exist:
      *  - tiramisu::a_input: for inputs of the function,
      *  - tiramisu::a_output: for outputs of the function,
      *  - tiramisu::a_temporary: for buffers used as temporary buffers within
      *  the function (any temporary buffer is allocated automatically by
      *  the Tiramisu runtime at the entry of the function and is
      *  deallocated at the exit of the function).
      */
    tiramisu::argument_t get_argument_type() const;

    /**
     * Return the memory location of the buffer.
     */
    cuda_ast::memory_location get_location() const;

    /**
      * Return the name of the buffer.
      */
    const std::string &get_name() const;

    /**
      * Get the number of dimensions of the buffer.
      */
    int get_n_dims() const;

    /**
      * Return the type of the elements of the buffer.
      */
    tiramisu::primitive_t get_elements_type() const;

    /**
      * Return the sizes of the dimensions of the buffer.
      */
    const std::vector<tiramisu::expr> &get_dim_sizes() const;

    /**
      * Set whether the buffer should be allocated automatically.
      */
    void set_auto_allocate(bool auto_allocation);

    /**
    * Sets whether the buffer should be deallocated automatically.
    */
    void set_auto_deallocate(bool auto_deallocation);

    /**
      * Set whether the GPU copy should be done automatically.
      */
    void set_automatic_gpu_copy(bool automatic_gpu_copy);

    /**
      * Set whether the FlexNLP copy should be done automatically.
      */
    void set_automatic_flexnlp_copy(bool automatic_flexnlp_copy);

    /**
     * Return true if all extents of the buffer are literal integer
     * constants (e.g., 4, 10, 100, ...).
     */
    bool has_constant_extents();

    /**
     * Return true if a statement that allocates the buffer was
     * already generated.
     */
    const bool is_allocated() const;

    /**
     * Mark an array as already allocated.
     */
    void mark_as_allocated();

    /** Tag the buffer as located in the GPU global memory. */
    void tag_gpu_global();
    /** Tag the buffer as located in the GPU register memory.
      * The buffer needs to have a unique dimension of size 1 (i.e. needs to be a scalar).*/
    void tag_gpu_register();
    /** Tag the buffer as located in the GPU shared memory. */
    void tag_gpu_shared();
    /** Tag the buffer as located in the GPU local thread memory. */
    void tag_gpu_local();
    /** Tag the buffer as located in the GPU constant memory. */
    void tag_gpu_constant();
};

/**
  * \brief A class that represents computations.
  *
  * \details A computation is an expression associated with an iteration domain.
  * A computation indicates what needs to be computed (the expression
  * that should be computed).
  * A computation has three representations:
  * - Level I: this level specifies "what" should be computed but does
  *   not specify "when" (order) and "where" (on which processor) each
  *   expression should be computed.
  *   This level also does not specify where computations should be stored
  *   in memory and in which data layout.
  * - Level II: this level specifies "what" should be computed, "when", i.e.
  *   the order in which the computation should be executed with regard to
  *   the other computations. And "where" each computation should be
  *   computed (i.e., on which processor).
  *   This level still does not specify where computations should be stored
  *   in memory and their data layout.
  * - Level III: this level is similar to Level II but it specifies where
  *   computations should be stored in memory and the data layout.
  */
class computation
{
    friend input;
    friend function;
    friend generator;
    friend buffer;
    friend constant;
    friend sync;
    friend computation_tester;
    friend send;
    friend recv;
    friend tiramisu::wait;
    friend cuda_ast::generator;
    friend auto_scheduler::syntax_tree;
    friend auto_scheduler::ast_node;
    friend auto_scheduler::computation_info;
    friend auto_scheduler::evaluate_by_execution;
    friend auto_scheduler::state_computation;
    friend auto_scheduler::ml_model_schedules_generator;
    friend void auto_scheduler::unroll_innermost_levels(std::vector<tiramisu::computation*> const& comps_list, int unroll_fact);

private:

    /**
      * Access function.  A map indicating how each computation should be stored
      * in memory.  It indicates in which buffer the computation should be stored
      * and which element of the buffer exactly it should be stored.
      */
    isl_map *access;

    /**
      * A vector that contains the list of let statements associated
      * with this computation.
      *
      * A let statement that is associated with the computation is a let statement
      * that will be added just before the computation.  The scope of the variable
      * defined by the let statement is this computation alone. i.e., it is not
      * defined in other computations.
      */
    std::vector<std::pair<std::string, tiramisu::expr>> associated_let_stmts;

    /**
     * The buffer attached "automatically" to this computation.
     * If the buffer is not created automatically, this variable will be empty.
     */
    tiramisu::buffer *automatically_allocated_buffer;

    /**
      * An ISL context associate with the function.
      */
    isl_ctx *ctx;

    /**
      * Data type: the type of the value returned by the computation.
      */
    tiramisu::primitive_t data_type;

    /**
      * Number of definitions added to this computation. Each time the function
      * add_definitions is called, definitions_number is incremented.
      */
    int definitions_number;

    /**
      * The ID of this definition. Each new computation when created has a
      * definition_ID equal to 0.  When a new definition is added, its ID
      * is 1, when a new definition is added, its definition ID is set to 2,
      * ...
      */
    int definition_ID;

    /**
     * An integer indicating the number of duplicates of this computation.
     * We use this to set the duplicate ID of any newly created computation.
     * Whenever a new duplicate is created, this number is incremented.
     */
    int duplicate_number;

    /**
      * An expression representing the computation
      * ("what" should be computed).
      */
    tiramisu::expr expression;

    /**
      * If has_multiple_definitions() is true, then this variable contains the
      * computation that was defined first among all the multiple definitions.
      */
    tiramisu::computation *first_definition;

    /**
      * The function where this computation is declared.
      */
    tiramisu::function *fct;

    /**
      * An isl_ast_expr representing the index of the array where the computation
      * will be stored.  This index is computed after the scheduling is done.
      */
    std::vector<isl_ast_expr *> index_expr;

    /**
     * A map between the original names of the iterators of a computation
     * and their transformed form after schedule (also after renaming).
     *
     * If in the original computation, we had
     *
     * {C[i0, i1]: ...}
     *
     * And if in the generated code, the iterators are called c0, c1, c2 and c3 and
     * the loops are tiled, then the map will be
     *
     * {<i0, c0*10+c2>, <i1, c1*10+c3>}.
     */
    std::map<std::string, isl_ast_expr *> iterators_map;

    /**
      * Does this computation represent a let statement ?
      *
      * Let statements should be treated differently:
      * - During Halide code generation a Halide let statement should be
      * created instead of an assignment statement.
      * - A let statement does not have/need an access function because
      * it writes directly to a scalar.
      * - When targeting Halide, let statements should be created after
      * their body is created, because the body is an argument needed
      * for the creation of the let statement.
      */
    bool is_let;

    /**
      * This variable is set to true if this computation is the first definition.
      * It is set to false if has_multiple_definitions() is true but this computation
      * is not the first one defined.
      * This is useful because in tiramisu, we assume that all the computations that
      * have the same name must have the same memory access. Thus any new definition
      * of a computation must copy the same memory access as the first definition, thus
      * we need to know which computation is the first definition.
      */
    bool is_first;

    /**
      * Return true if this computation is the first definition.
      * It returns false if has_multiple_definitions() is true but this computation
      * is not the first one defined.
      * This is useful because in tiramisu, we assume that all the computations that
      * have the same name must have the same memory access. Thus any new definition
      * of a computation must copy the same memory access as the first definition, thus
      * we need to know which computation is the first definition.
      */
    bool is_first_definition();

    /* Is true if the computation is inline. */
    bool is_inline;

    /**
      * Iteration domain of the computation.
      * In this representation, the order of execution of computations
      * is not specified, the computations are also not mapped to memory.
     */
    isl_set *iteration_domain;

    /**
      * The name of this computation.
      * Computation names should not start with _ (an underscore).
      * Names starting with _ are reserved names.
      */
    std::string name;

    /* The number of dimensions in the original definition of the computation. */
    int number_of_dims;

    /**
     * A predicate around the computation. The computation is executed
     * only if this predicate is true. This is useful to insert a non-affine
     * condition around the computation.
     */
    tiramisu::expr predicate;

    /**
      * The schedules of the computation.
      */
    isl_map * schedule;

    /**
      * TODO: use buffers directly from computations, no need to have
      * bindings.
      *
      * \p schedule_this_computation should be set to true when the computation
      * should be scheduled and when code for the computation should be generated
      * during code generation.
      * It should be set to false when the computation is used to represent a
      * buffer (i.e., the computation is used only as a binding to a buffer).
      * In this case, the computation is not scheduled and no code for the
      * computation is generated.
      */
    bool schedule_this_computation;

    // Private class members that are computed during code generation.

    /**
      * Halide statement that assigns the computation to a buffer location.
      */
    Halide::Internal::Stmt stmt;

    /**
      * Time-processor domain of the computation.
      * In this representation, the logical time of execution and the
      * processor where the computation will be executed are both
      * specified.
      */
    isl_set *time_processor_domain;

    /**
     * The shape of the thread block that this computation is mapped to in case
     * a gpu_tile operation is done.
     */
    std::vector<int> thread_block_shape;

    /**
      * True if the computation is executed in a nonblocking or asynchronous way.
      */
    bool _is_nonblock_or_async;

    /**
     * True if  the rank should be dropped from index linearization.
     */
    bool _drop_rank_iter;

    /**
     * If _drop_rank_iter == true, this is the level to drop
     */
    var drop_level;

    /**
      * If the computation represents a library call, this will contain the
      * necessary arguments to the function.
      */
    std::vector<tiramisu::expr> library_call_args;

    /**
      * If the computation represents a library call and requires a linear index (for either a LHS or RHS access),
      * this indicates which argument to the function represents the linear index.
      */
    // @{
    int lhs_argument_idx;
    int rhs_argument_idx;
    // @}

    /**
      * If the computation represents a library call and requires a wait, this indicates which argument to the function
      * represents the linear index.
      */
    int wait_argument_idx;

    /**
      * If the computation represents a library call and requires a wait, this is like a regular access map, but gives
      * an access into a special wait buffer.
      */
    isl_map *wait_access_map = nullptr;

    /**
      * By default, this becomes an o_access to signify that we are writing into a location. However, it can be changed
      * to something like o_address_of to indicate that we don't want to do a store to the location, rather we just
      * want to get a pointer to the store location.
      */
    // @{
    tiramisu::op_t lhs_access_type;
    tiramisu::op_t rhs_access_type;
    // @}

    // ADD:FLEXNLP
    /**
      * This variable contains the computation's iteration variables
      */
    std::vector<tiramisu::var> iteration_variables;

    /**
      * Returns the iteration_variables vector containing each of the
      * computation's iteration variables
      */
    std::vector<tiramisu::var> get_iteration_variables();

    /**
      * Apply a transformation on the domain of the schedule.
      * This is a transformation from iteration domain to the time-processor
      * domain.
      *
      * For example, to apply to shift the i dimension of the iteration domain
      * of C0, you can apply the transformation
      *
      * C0[i, j] -> C0[i+2, j]
      */
    void apply_transformation_on_schedule_domain(std::string map_str);

    /**
      * Add the set of constraints \p domain_constraints to the domain
      * of the schedule and add the set of constraints \p range_constraints
      * to the range of the schedule.
      *
      * \p domain_constraints and \p range_constraints are both ISL sets in
      * the ISL string format.
      */
    void add_schedule_constraint(std::string domain_constraints, std::string range_constraints);

    /**
      * Check that the names used in \p dimensions are not already
      * in use.
      */
    void assert_names_not_assigned(std::vector<std::string> dimensions);

    /**
      * Return true if a buffer was allocated to this computation or to one
      * of its updates (we assume that we allocate the same buffer for the
      * computation and its updates).
      */
    bool buffer_already_allocated();

    /**
      * Identical to
      *     void before(computation &consumer, tiramisu::var L);
      * Except that the loop level in this case is identified using its number. This is
      * used mainly internally by Tiramisu while the other is designed to be user friendly.
      */
    void before(computation &consumer, int L);

    /**
      * Check that the \p dimensions are valid:
      * - The dimension numbers are within the bounds of accepted dimensions
      * (i.e., between computation::root_dimension and the maximal dimension
      * number in the time-space domain.
      */
    void check_dimensions_validity(std::vector<int> dimensions);

    /**
     * Compute two subsets of computations:
     *  - the first is the subset of needed computations,
     *  - the second is the subset of produced computations,
     *
     * The needed computations are those that need to be consumed
     * by the \p consumer if this computation is to be executed in the
     * same loop as the consumer but at loop level \p L.
     *
     * The produced subset represent are the computations produced by
     * this computation at the level \p L.
     *
     * This function takes in consideration the schedule of
     * this computation and the schedule of consumer in order to
     * compute the needed area.
     *
     * This function creates new parameters, these parameters are used to specialize
     * the needed area for specific loop iterators (i.e., fix the needed area).
     * These parameters are stored in \p param_names.
     * This vector is used internally by Tiramisu in compute_at() which calls this
     * function.
     */
    std::vector<isl_set *> compute_needed_and_produced(computation &consumer, int L,
                                                       std::vector<std::string> &param_names);


    /**
      * Take a list of iterators and a name as input and construct the iteration
      * domain that iterates over those iterators.  The name of the statement
      * of the iteration domain is \p name.
      * The iteration domain is a string in ISL format.
      * \p predicate is an expression that represents constraints on the iteration domain
      * (for example (i != j). The predicate has to be an affine
      * expression.
      */
    std::string construct_iteration_domain(std::string name, std::vector<var> iterator_variables, tiramisu::expr predicate);

    /**
      * Create a copy of this computation.
      */
    tiramisu::computation *copy();

    /**
      * Create a Halide statement that assigns the computations to the memory
      * buffer and location specified by the access function.
      */
    void create_halide_assignment();

    /**
      * Returns a pair of expressions (LHS, RHS) representing the assignment performed
      * by the computation.
      * The LHS would represent a memory buffer access, and the RHS would represent
      * the value of the assignment.
      */
    std::pair<expr, expr> create_tiramisu_assignment(std::vector<isl_ast_expr *> &index_expr);

    /**
      * Apply a duplication transformation from iteration space to
      * time-processor space.
      * A duplication transformation duplicates the original computation,
      * so the domain of the schedule has to be the iteration domain of
      * the original computation.
      *
      * For example, to duplicate C0 into a first duplicate:
      *
      * C0[i, j] -> C0[1, 0, i, 0, j, 0]
      *
      * To duplicate C0 again
      *
      * C0[i, j] -> C0[2, 0, j, 0, i, 0]
      *
      */
    void create_duplication_transformation(std::string map_str);

    /*
     * Create a new Tiramisu constant M = v*floor(N/v) and use it as
     * a separator.
     *
     * The separator is used to separate a computation. That
     * is, it is used to create two identical computations where we have
     * a constraint like i<M in the first and i>=M in the second.
     * The first is called the full computation while the second is called
     * the separated computation.
     *
     * This function is used in vectorize and unroll mainly.
     */
    tiramisu::constant *create_separator(const tiramisu::expr &loop_upper_bound, int v);

    /**
       * Duplicate a part of this computation (or all of it) and return
       * the duplicate computation.  The duplicate computation is identical
       * to this computation in every aspect except that its iteration domain
       * is the intersection of the iteration domain of the original computation
       * and the domain provided in \p domain_constraints.
       *
       * Example: let us assume that you have the following computation
       *
       * {C0[i,j]: 0<=i<N and 0<=j<N}
       *
       * If you want to create a duplicate of this computation
       * that has the following iteration domain
       *
       *          "{C0[i,j]: 0<=i<=10 and 0<=j<=5"
       *
       * you can write
       *
       * C0.duplicate("{C0[i,j]: 0<=i<=10 and 0<=j<=5");
       *
       * This will keep the original computation C0 and will create a
       * new computation (duplicate of C0) that has
       * "{C0[i,j]: 0<=i<=10 and 0<=j<=5". as iteration domain.
       * (duplicate() in fact intersects the domain provided with
       * the original domain).
       *
       * The duplicate computation is an exact copy of the original
       * computation except in one thing:
       * The iteration domain of the duplicate computation is
       * the intersection of the iteration domain of the original
       * computation with the constraints provided as an argument
       * to the duplicate command; this means that the iteration domain
       * of the duplicate computation is always a subset of the original
       * iteration domain;
       *
       * If you schedule a computation C, then duplicate it, then the duplicate
       * will have exactly the same schedule as the original computation.
       * After duplication, any schedule applied on the original computation
       * will not be applied automatically on the duplicate computation. They
       * become two separate computations that need to be scheduled separately.
       *
       * \p domain_constraints is a set of constraints on the iteration domain that
       * define the duplicate.
       * \p range_constraints is a set of constraints on the time-processor domain
       * that define the duplicate.
       */
     tiramisu::computation *duplicate(std::string domain_constraints,
             std::string range_constraints);

    /**
      * Return the access function of the computation.
      */
    isl_map *get_access_relation() const;

    /**
      * Return the access function of the computation after transforming
      * it to the time-processor domain.
      * The domain of the access function is transformed to the
      * time-processor domain using the schedule, and then the transformed
      * access function is returned.
      */
    isl_map *get_access_relation_adapted_to_time_processor_domain() const;

    /**
      * Return vector of associated let statements.
      *
      * This is a vector that contains the list of let statements
      * associated with this computation.
      *
      * A let statement that is associated with the computation is a
      * let statement that will be added just before the computation.
      * The scope of the variable defined by the let statement is this
      * computation alone. i.e., it is not defined in other computations.
      */
    const std::vector<std::pair<std::string, tiramisu::expr>> &get_associated_let_stmts() const;

    /**
     * Get the name of dynamic dimension that corresponds to the
     * \p loop_level in the time-space domain.
     */
    std::string get_dimension_name_for_loop_level(int loop_level);

    /**
      * Return the number of the duplicates of this computation.
      *
      * The number of duplicates is incremented if the computation is duplicated using
      * the duplicate() function.
      */
    int get_duplicates_number() const;

    /**
      * If has_multiple_definitions() is true, then this function returns the
      * computation that was defined first among all the multiple definitions.
      */
    tiramisu::computation *get_first_definition();

    /**
      * Return the function where the computation is declared.
      */
    tiramisu::function *get_function() const;

    /**
      * Return the Halide statement that assigns the computation to a buffer location.
      * Before calling this function the user should first call Halide code generation
      * (function::gen_halide_stmt()).
      */
    Halide::Internal::Stmt get_generated_halide_stmt() const;

    /**
      * Return vector of isl_ast_expr representing the indices of the array where
      * the computation will be stored.
      */
    std::vector<isl_ast_expr *> &get_index_expr();

    /**
     * Get the union of the iteration domains of this computation and
     * all the other definitions of this computation (updates,
     * duplicates, ...).
     */
    isl_set *get_iteration_domains_of_all_definitions();


    /**
     * The iterators map is a map between the original names of the iterators of a computation
     * and their transformed form after schedule (also after renaming).
     *
     * If in the original computation, we had
     *
     * {C[i0, i1]: ...}
     *
     * And if in the generated code, the iterators are called c0, c1, c2 and c3 and
     * the loops are tiled, then the map will be
     *
     * {<i0, c0*10+c2>, <i1, c1*10+c3>}.
     */
    std::map<std::string, isl_ast_expr *> get_iterators_map();

    /**
      * Return the names of iteration domain dimensions.
      */
    std::vector<std::string> get_iteration_domain_dimension_names();

    /**
      * Get the number of dimensions of the iteration
      * domain of the computation.
      */
    int get_iteration_domain_dimensions_number();

    /**
      * Return the names of loop levels.
      * (i.e., the names of dynamic dimensions in time-space).
      */
    std::vector<std::string> get_loop_level_names();

    /**
      * Get the number of loop levels.
      */
    int get_loop_levels_number();

    /**
     * Gets the span of the loop level \p level (number of iterations) and returns
     * it as a tiramisu::expr.
     */
    expr get_span(int level);

    /**
      * Return the root of the tree of definitions of this computation.
      * The tree of definitions is the tree that emerges after adding multiple
      * definitions to the same computation. Let's take the following example.
      * Assuming we have the computation c0.  We can add a definition c1 to c0.
      * The tree in this case would have two nodes, c0 and c1. c0 is the root.
      * We can add another definition c2 to c0. Then we can add another definition c3
      * to c1. In this case the tree would have four nodes where c0 is the root.
      */
    tiramisu::computation *get_root_of_definition_tree();

    /**
      * Get the number of dimensions of the time-space
      * domain of the computation.
      */
    int get_time_space_dimensions_number();

    /**
      * Trim the union of schedules of the computation and
      * return the result.
      * The trimmed union of schedules is the schedule without the
      * duplicate dimension (the dimension used to indicate the
      * duplicate ID).
      */
    isl_map *get_trimmed_union_of_schedules() const;

    /**
      * Return the time-processor domain of the computation.
      * In this representation, the logical time of execution and the
      * processor where the computation will be executed are both
      * specified.
      */
    isl_set *get_time_processor_domain() const;

    /**
      * Return the trimmed time-processor domain.
      * The first dimension of the time-processor domain is used
      * to indicate redundancy of the computation.  Computations that
      * are not redundant have 0 in that dimension, whereas
      * computations that are redundant (i.e., are recomputed) have
      * a number different from 0 in that dimension and which represents
      * the ID of the redundant computation.
      * The trimmed time-processor domain is the time-processor domain
      * without the dimension that represents the redundancy.  We simply
      * take the time-processor domain and remove the first dimension.
      */
    isl_set *get_trimmed_time_processor_domain();

    /**
      * Generate an identity schedule for the computation.
      *
      * This identity schedule is an identity relation created from the iteration
      * domain.
      */
    isl_map *gen_identity_schedule_for_iteration_domain();

    /**
      * Generate an identity schedule for the computation.
      *
      * This identity schedule is an identity relation created from the
      * time-processor domain.
      */
    isl_map *gen_identity_schedule_for_time_space_domain();

    /**
      * Returns all updates that have been defined for this computation using
      * add_definitions. The 0th update is a pointer to this computation.
      */
    std::vector<computation*>& get_updates();

    /**
      * Search the time-space domain (the range of the schedule) and
      * return the loop level numbers that correspond to the dimensions
      * named in \p dim_names.
      * In other words, translate the vector of dimension names (\p dim_names)
      * into loop level numbers. We need to do this because the internal Tiramisu
      * scheduling functions use dimension numbers instead of dimension
      * names (which are used in the user level scheduling functions).
      */
    std::vector<int> get_loop_level_numbers_from_dimension_names(std::vector<std::string> dim_names);

    /**
     * Return true if this computation is supposed to have an access to other
     * computations.
     * Knowing this is important so that tiramisu does not try to compute
     * the accesses of the RHS.
     */
    bool has_accesses() const;

    /**
      * Return if this computation represents a let statement.
      *
      * Let statements should be treated differently because:
      * - A let statement does not have/need an access function because
      * it writes directly to a scalar.
      * - If the backend is Halide:
      *      - During Halide code generation a Halide let statement
      *      should be created instead of an assignment statement.
      *      - When targeting Halide, let statements should be created
      *      after their body is created, because the body is an argument
      *      needed for the creation of the let statement.
      */
    bool is_let_stmt() const;

    /**
      * Return true if this computation represents a call to a library function.
      * Mainly used for distributed communication functions.
      */
    bool is_library_call() const;

    /**
      * Return true if the rank loop iterator should be removed from linearization.
      */
    bool should_drop_rank_iter() const;

    /**
      * Return the loop level to drop from linearization — presumably the
      * level of the rank iterator when should_drop_rank_iter() returns
      * true. TODO(review): confirm against the implementation.
      */
    int get_level_to_drop();

    /**
      * Assign a name to iteration domain dimensions that do not have a name.
      */
    void name_unnamed_iteration_domain_dimensions();

    /**
      * Assign a name to iteration domain dimensions that do not have a name.
      */
    void name_unnamed_time_space_dimensions();

    /**
      * Set an identity schedule for the computation.
      *
      * This identity schedule is an identity relation created from the iteration
      * domain.
      *
      * This sets the schedule of the original computation
      * and does not set the schedule of any duplicate
      * computation.
      */
    void set_identity_schedule_based_on_iteration_domain();

    /**
      * Return true if this computation is supposed to be scheduled
      * by Tiramisu.
      */
    bool should_schedule_this_computation() const;

    /**
      * Intersect \p set with the context of the computation.
      */
    // @{
    isl_set *intersect_set_with_context(isl_set *set);
    isl_map *intersect_map_domain_with_context(isl_map *map);
    // @}

    /**
     * Rename this computation and modify the schedule and the access relation
     * accordingly.
     *
     * Computation that are defined multiple times need to be renamed, because
     * those computations in general have different expressions and the code
     * generator expects that computations that have the same name always have
     * the same expression and access relation. So we should rename them to avoid
     * any ambiguity for the code generator.
     */
    void rename_computation(std::string new_name);

    /**
      * Separate the iteration domain into two iteration domains using
      * the constant \p C.
      * Let us assume that the dimension \p dim of the iteration domain
      * is called i.  The iteration domain is separated into two domains
      * using the hyperplane (i = v*floor(N/v)). That means, two copies of the
      * iteration domain are created, the constraint (i<=v*floor(N/v)) is added to
      * the schedule of the first while the constraint (i>v*floor(N/v)) is added to
      * the schedule of the second.
      *
      * Let us assume that we have the following iteration domain
      *
      *   {S0[i,j]: 0<=i<N and 0<=j<M}
      *
      * To separate this iteration domain by the hyperplane j=4*floor(M/4), one should
      * call
      *
      *   S0.separate(1, tiramisu::expr("M"), 4)
      *
      * (NOTE(review): the separate() declaration below takes a fourth
      * \p lower_bound argument that this example omits — confirm and
      * update the example accordingly.)
      *
      * This will result in the creation of two computations that have the same
      * iteration domains but have different schedules. The schedules are as
      * follows:
      *
      * The schedule of the original (full) computation would be
      * {S0[i,j]->S0[0, 0, i, 0, j, 0]: j<4*floor(M/4)}
      *
      * The schedule of the separated (partial) computation would be
      * {S0[i,j]->S0[0, 0, i, 10, j, 0]: 4*floor(M/4)<=j}
      *
      * The second computation created using separate can be accessed with
      * get_update().
      */
    void separate(int dim, tiramisu::expr N, int v, tiramisu::expr lower_bound);

    /**
      * Like separate, but the precise separate points are specified in _separate_points. _max is the loop max.
      */
    void separate_at(var level, std::vector<tiramisu::expr> _separate_points, tiramisu::expr _max);

    /**
      * Separate then split the loop \p L0
      * (see tiramisu::computation::separate and tiramisu::computation::split)
      *
      * This function returns true if the split actually happened (in some
      * cases, if the loop cannot be split because it is too small for example,
      * then no split happens even after calling the split function, in such
      * cases the function returns false).
      */
    //@{
    bool separateAndSplit(tiramisu::var L0, int sizeX);
    bool separateAndSplit(tiramisu::var L0, int sizeX,
                          tiramisu::var L0_outer, tiramisu::var L0_inner);
    bool separateAndSplit(int L0, int sizeX);
    //@}

    /**
      * Split the loop level \p L0 of the iteration space into two
      * new loop levels.
      *
      * \p sizeX is the extent (size) of the inner loop created after
      * splitting.
      */
     //@{
     // NOTE(review): this Doxygen group is empty — the split() overload
     // declarations described above appear to be missing here; confirm
     // against the implementation file.
     //@}

    /**
      * Set the names of loop levels dimensions.
      * The loop levels are specified using \p loop_levels
      * and their names are specified using \p names.
      * Users can only set the names of loop levels (dynamic dimensions),
      * the static dimension names are set to default names.
      */
    void set_loop_level_names(std::vector<int> loop_levels, std::vector<std::string> names);

    /**
      * Set the names of loop level dimensions.
      * The loop level names are specified using \p names.
      * Users can only set the names of loop levels (dynamic dimensions),
      * the static dimension names are set to default names.
      */
    void set_loop_level_names(std::vector<std::string> names);

    /**
      * Set the names of the dimensions of the schedule domain.
      * The dimension names are specified using \p names, their positions
      * are indicated using \p loop_levels.
      */
    void set_schedule_domain_dim_names(std::vector<int> loop_levels, std::vector<std::string> names);

    /**
      * Set the iteration domain of the computation
      */
    void set_iteration_domain(isl_set *domain);

    /**
     * Set whether a computation has multiple definitions.
     * i.e., whether the computation is defined multiple times (i.e.,
     * whether there are many computations with the same name). An
     * update is a special case of a computation defined multiple
     * times.
     */
    void set_has_multiple_definitions(bool val);

    /**
     * The iterators map is a map between the original names of the iterators of a computation
     * and their transformed form after schedule (also after renaming).
     *
     * If in the original computation, we had
     *
     * {C[i0, i1]: ...}
     *
     * And if in the generated code, the iterators are called c0, c1, c2 and c3 and
     * the loops are tiled, then the map will be
     *
     * {<i0, c0*10+c2>, <i1, c1*10+c3>}.
     */
    void set_iterators_map(std::map<std::string, isl_ast_expr *> map);

    /**
      * Identical to
      *      void shift(tiramisu::var L0, int n);
      */
    void shift(int L0, int n);

    /**
      * Simplify \p set using the context and by calling
      * set coalescing.
      */
    // @{
    isl_set *simplify(isl_set *set);
    isl_map *simplify(isl_map *map);
    // @}

    /**
      * Identical to
      * \code
      *    void tag_gpu_level(tiramisu::var L0, tiramisu::var L1);
      *    void tag_gpu_level(tiramisu::var L0, tiramisu::var L1,
      *                       tiramisu::var L2, tiramisu::var L3);
      *    void tag_gpu_level(tiramisu::var L0, tiramisu::var L1,
      *                       tiramisu::var L2, tiramisu::var L3,
      *                       tiramisu::var L4, tiramisu::var L5);
      * \endcode
      * The outermost loop level is 0.
      */
    // @{
    void tag_gpu_level(int L0, int L1);
    void tag_gpu_level(int L0, int L1, int L2, int L3);
    void tag_gpu_level(int L0, int L1, int L2, int L3, int L4, int L5);
    // @}

    /**
      * Tag the loop levels \p L0 (and optionally \p L1 and \p L2) to be
      * mapped to GPU block dimensions.
      *
      * The outermost loop level is 0.
      */
    // @{
    void tag_gpu_block_level(int L0);
    void tag_gpu_block_level(int L0, int L1);
    void tag_gpu_block_level(int L0, int L1, int L2);
    // @}

    /**
      * Tag the loop levels \p L0 (and optionally \p L1 and \p L2) to be
      * mapped to GPU thread dimensions.
      *
      * The outermost loop level is 0.
      */
    // @{
    void tag_gpu_thread_level(int L0);
    void tag_gpu_thread_level(int L0, int L1);
    void tag_gpu_thread_level(int L0, int L1, int L2);
    // @}

    /**
      * Contains a list of all definitions added to this computation. The 0th definition is
      * always this very computation.
      */
    std::vector<tiramisu::computation *> updates;

    /**
      * Update loop level names. This function should be called after each scheduling operation
      * because scheduling usually changes the original loop level names.
      * This function erases \p nb_loop_levels_to_erase loop level names starting from the
      * loop level \p start_erasing. It then inserts the loop level names \p new_names in
      * \p start_erasing. In other words, it replaces the names of loop levels from
      * \p start_erasing to \p start_erasing + \p nb_loop_levels_to_erase with the loop levels
      * indicated by \p new_names.  This function sets the non erased loop levels to be equal to the
      * original loop level names.
      *
      * \p original_loop_level_names : a vector containing the original loop level names (loop level
      * names before scheduling).
      *
      * \p new_names : the new loop level names.
      *
      * \p start_erasing : start erasing loop levels from this loop level.
      *
      * \p nb_loop_levels_to_erase : number of loop levels to erase.
      *
      * Example. Assuming the original loop levels are {i0, i1, i2, i3}
      *
      * Calling this->update_names({i0, i1, i2, i3}, {x0, x1}, 1, 2) updates the loop levels to become
      * {i0, x0, x1, i3}.
      */
    void update_names(std::vector<std::string> original_loop_level_names, std::vector<std::string> new_names,
                      int start_erasing, int nb_loop_levels_to_erase);

    /**
      * A vector describing the access variables in the original definition of  a computation.
      * For every named dimension, a pair representing the index of the named dimension
      * and the name of the dimension is added to access_variables.
      * E.g. if a computation is defined as S[i, 0, j], access_variables will contain
      * {(0, "i"), (2, "j")}.
      */
    std::vector<std::pair<int, std::string>> access_variables;

    /**
      * Compare two computations.
      *
      * Two computations are considered to be equal if they have the
      * same name.
      */
    bool operator==(tiramisu::computation comp1);

    /**
      * Return the distributed dimension of a computation.
      */
    int get_distributed_dimension();

    /**
      * Return names of trimmed time space domain dimensions.
      */
    std::vector<std::string> get_trimmed_time_space_domain_dimension_names();

    /**
      * \brief Return a unique identifier for the communication based on the type of
      * rank and a sequential number.
      *
      * All communications of a computation c will have the same prefix b_c followed
      * by the sequential number i and then the type of communication r_snd or r_rcv.
      *
      */
    std::string get_comm_id(rank_t rank_type, int i);

    /**
      * Return the iteration domain of the communication set which can be either
      * a send iteration domain or a receive iteration domain.
      * set is an exchange set which we transform to a recv/send it_dom.
      */
    isl_set* construct_comm_set(isl_set* set, rank_t rank_type, int comm_id);

    /**
      * \brief Return an unordered_map comp_name, set_to_exchange.
      *
      * The set to exchange expresses a subset of the iteration domain of the computation comp_name
      * that needs to be sent by r_sender which owns the data and received by r_receiver
      * which needs this data.
      */
    std::unordered_map<std::string, isl_set*> construct_exchange_sets();

    /**
      * \brief Generate distributed communication code.
      *
      * Given the iteration domain of send and receive, this function creates xfers, schedules them,
      * and handles the storage of the receives.
      * Currently, this process works for programs that distribute the outermost loop.
      */
    void gen_communication_code(isl_set*recv_it, isl_set* send_it, int communication_id, std::string computation_name);

protected:

    /**
      * \brief Construct the distribution map of a computation.
      *
      * A distribution map partitions a computation across ranks.
      * Given the type of the rank rank_type which is either r_sender or r_receiver,
      * the distribution map will specify for each rank the iterations it should execute.
      *
      * Example :
      * \code
      * c.set_expression(in(i)+in(i-1));
      * c.tag_distribute_level(i);
      * \endcode
      *
      * If the function is called on c, it will construct the following map:
      *
      * \code
      * [r] -> { c[i] -> c[o0] : o0 = i and r >= 0 and r <= 4 and i >= 2r and i <= 1 + 2r }
      * \endcode
     */
    isl_map* construct_distribution_map(tiramisu::rank_t rank_type);

    /**
      * True if this computation represents a library call.
      */
    bool _is_library_call;

    /**
      * If the computation represents a library call, this is the name of the function.
      */
    std::string library_call_name;

    /**
      * An index expression just for the request buffer.
      */
    isl_ast_expr *wait_index_expr;

    /**
      * Dummy constructor for derived classes.
      */
    computation();

    /**
      * Compute the size of the buffer allocated automatically to hold the
      * results of this computation.
      */
    std::vector<tiramisu::expr> compute_buffer_size();

    /**
      * Return the context of the computations.
      */
    isl_ctx *get_ctx() const;

    /**
     * Return the predicate around this computation if a predicate
     * was added using add_predicate().
     */
    tiramisu::expr get_predicate();

    /**
      * Return a unique name of computation; made of the following pattern:
      * [computation name]@[computation address in memory]
      */
    const std::string get_unique_name() const;

    /**
     *
     * Return true if the computation has multiple definitions.
     * i.e., if the computation is defined multiple times.
     * An update is a special case where a computation is defined
     * multiple times.  Duplicate computations are another example.
     *
     * In the following example, C is defined multiple times whereas
     * D is defined only once.
     *
     * \code
     *
     * C(0) = 0
     *
     * C(i) = C(i-1) + 1
     *
     * D(i) = C(i) + 1
     *
     * \endcode
     */
    bool has_multiple_definitions();

    /**
      * Initialize a computation.
      * This is a private function that should not be called explicitly
      * by users.
      */
    void init_computation(std::string iteration_space_str,
                          tiramisu::function *fct,
                          const tiramisu::expr &e,
                          bool schedule_this_computation,
                          tiramisu::primitive_t t);

    /**
      * Set the name of the computation.
      */
    void set_name(const std::string &n);

    /**
      * Set the schedule indicated by \p map.
      *
      * \p map is a string that represents a mapping from the iteration domain
      * to the time-space domain (the ISL format to represent maps is
      * documented in http://barvinok.gforge.inria.fr/barvinok.pdf in Sec 1.2.2).
      *
      * The schedule is a map from the iteration domain to a time space
      * domain. The same name of space should be used for both the range
      * and the domain of the schedule.
      *
      * In general, users can set the schedule using high level functions such
      * as before(), after(), tile(), compute_at(), vectorize(), split(), ...
      * The use of this function is only reserved for advanced users who want
      * a low level control of the schedule.
      *
      * Vectors in the time-space domain have the following form
      *
      * computation_name[redundancy_ID,static,dynamic,static,dynamic,static,dynamic,static,...]
      *
      * The first dimension of the vector is used to indicate the redundancy ID
      * (the notion of the redundancy ID is explained later).
      *
      * The following dimensions are interleaved dimensions: static, dynamic, static,
      * dynamic, ...
      * Dynamic dimensions represent the loop levels, while static dimensions are
      * used to order statements within a given block of statements in a given loop
      * level.
      * For example, the computations c0 and c1 in the following loop nest
      *
      * \code
      * for (i=0; i<N; i++)
      *   for (j=0; j<N; j++)
      *   {
      *     c0;
      *     c1;
      *   }
      *
      * \endcode
      *
      * have the following representations in the iteration domain
      *
      * \code
      * {c0[i,j]: 0<=i<N and 0<=j<N}
      * {c1[i,j]: 0<=i<N and 0<=j<N}
      * \endcode
      *
      * and the following representation in the time-space domain
      *
      * \code
      * {c0[0,0,i,0,j,0]: 0<=i<N and 0<=j<N}
      * {c1[0,0,i,0,j,1]: 0<=i<N and 0<=j<N}
      * \endcode
      *
      * The first dimension (dimension 0) in the time-space
      * domain (the leftmost dimension) is the redundancy ID
      * (in this case it is 0, the meaning of this ID will be explained later).
      * The second dimension (starting from the left) is a static dimension,
      * the third dimension is a dynamic dimension that
      * represents the loop level i, ..., the fifth dimension is a dynamic
      * dimension that represents the loop level j and the last dimension
      * (dimension 5) is a static dimension and allows the ordering of
      * c1 after c0 in the loop nest.
      *
      * To transform the previous iteration domain to the
      * time-space domain, the following schedule should be used:
      *
      * \code
      * {c0[i,j]->c0[0,0,i,0,j,0]: 0<=i<N and 0<=j<N}
      * {c1[i,j]->c1[0,0,i,0,j,1]: 0<=i<N and 0<=j<N}
      * \endcode
      *
      * The first dimension called "redundancy ID" is only meaningful if the
      * computation is redundant. i.e., some parts of the computation are
      * redundantly computed.  Redundant computations are in general used to
      * maximize parallelism and data locality on the expense of doing some
      * computations redundantly.
      * For example, if the two computations c1(i,j) and c2(i,j) both depend
      * on the computation c0(i,j), instead of waiting for c0(i,j) to be
      * computed and then computing c1(i,j) and c2(i,j) in parallel, the thread
      * executing c1(i,j) can compute c0(i,j) by itself and then run c1(i,j).
      * The thread that computes c2(i,j) can do the same and compute c0(i,j)
      * by itself and then compute c2(i,j). In this case the two threads do not
      * need to wait. This is done at the expense of redundant computation since
      * c0(i,j) is computed by both threads.
      *
      * In general redundant computations are useful when tiling stencil
      * computations.  In the context of stencils such a tiling is called
      * "overlapped tiling".  Tiles that depend on results computed by other
      * tiles that run in parallel can compute the boundaries redundantly which
      * allows them to avoid waiting and thus can run in parallel.
      *
      * In Tiramisu, the user can indicate that a chunk of a computation
      * should be computed redundantly. The original computation always has a redundancy
      * ID equal to 0 (which means this is the original computation).
      * The redundant chunk has an ID that is different from 0 and that is
      * used to uniquely identify it.
      *
      * For example if we want to compute all of c0 three times (that is,
      * compute the original computation and compute two redundant computations),
      * we can use the following schedules:
      *
      * The schedule of the original computation:      {c0[i,j]->c0[0,0,i,0,j,0]: 0<=i<N and 0<=j<N}
      * The schedule of the redundant computation N°1: {c0[i,j]->c0[1,0,i,0,j,0]: 0<=i<N and 0<=j<N}
      * The schedule of the redundant computation N°2: {c0[i,j]->c0[2,0,i,0,j,0]: 0<=i<N and 0<=j<N}
      *
      * The schedule of c0 in this case would be three maps that map c0[i,j] to
      * the three different redundant computations in the time-processor domain:
      *
      * \code
      * {c0[i,j]->c0[0,0,i,0,j,0]: 0<=i<N and 0<=j<N;
      *  c0[i,j]->c0[1,0,i,0,j,0]: 0<=i<N and 0<=j<N;
      *  c0[i,j]->c0[2,0,i,0,j,0]: 0<=i<N and 0<=j<N}
      * \endcode
      *
      * The function set_schedule() overrides any other schedule set by the high level
      * scheduling functions.  Currently the user has to choose between using the high
      * level scheduling functions or using this low level set_schedule function. The user
      * cannot mix the use of the two in the same program because they are not compatible.
      */
    // @{
    void set_schedule(isl_map *map);
    void set_schedule(std::string map_str);
    // @}

    /**
      * Collapse all the iterations of a loop into one single iteration.
      */
    void full_loop_level_collapse(int level, tiramisu::expr collapse_from_iter);
    /**
      * \overload
      */
    computation(std::string name,std::vector<var> iterator_variables, tiramisu::expr e, bool schedule_this_computation, primitive_t t);

    /**
      * \overload
      *
      * The argument \p predicate is used to add an affine condition
      * (predicate) around the computation. This argument should only
      * be used if the predicate is affine. For non affine predicates
      * use the .add_predicate() function.
      *
      * Example, if you have
      *
      * \code
      * for (i=0; i<20; i++)
      *   if (i != 7)
      *     S(i) = 0;
      * \endcode
      *
      * You can express this as follows
      *
      * \code
      * computation S("S", {i}, (i != 7), 0);
      * \endcode
      *
      */
    computation(std::string name, std::vector<tiramisu::var> iterator_variables, tiramisu::expr predicate, tiramisu::expr e, bool schedule_this_computation, primitive_t t, tiramisu::buffer cpu_buffer_to_map_to_device = tiramisu::buffer());

public:

    /**
      * \brief Constructor for computations.
      *
      * \details \p iteration_domain is a string that represents the iteration
      * domain of the computation.  The iteration domain should be written
      * in the ISL format (http://barvinok.gforge.inria.fr/barvinok.pdf Section 1.2.1).
      *
      * The iteration domain of a statement is a set that contains
      * all of the execution instances of the statement (a statement in a
      * loop has an execution instance for each loop iteration in which
      * it executes). Each execution instance of a statement in a loop
      * nest is uniquely represented by an identifier and a tuple of
      * integers  (typically, the values of the outer loop iterators).
      *
      * For example, the iteration space of the statement S0 in the following
      * loop nest
      *
      * \code
      * for (i=0; i<2; i++)
      *   for (j=0; j<3; j++)
      *      S0;
      * \endcode
      *
      * is {S0[0,0], S0[0,1], S0[0,2], S0[1,0], S0[1,1], S0[1,2]}
      *
      * S0[0,0] is the execution instance of S0 in the iteration [0,0].
      *
      * The previous set of integer tuples can be compactly described
      * by affine constraints as follows
      *
      * {S0[i,j]: 0<=i<2 and 0<=j<3}
      *
      * In general, the loop nest
      *
      * \code
      * for (i=0; i<N; i++)
      *   for (j=0; j<M; j++)
      *      S0;
      * \endcode
      *
      * has the following iteration domain
      *
      * {S0[i,j]: 0<=i<N and 0<=j<M}
      *
      * This should be read as: the set of points [i,j] such that
      * 0<=i<N and 0<=j<M.
      *
      * The name of the computation in the iteration domain should not
      * start with _ (an underscore).  Names starting with _ are reserved
      * names.
      *
      * \p e is the expression computed by the computation.
      * It is possible to declare the computation without specifying the expression.
      * The expression can be specified later using computation::set_expression().
      * An example of setting the expression after declaring the computation
      * is presented in tests/test_04.cpp.
      *
      * \p schedule_this_computation should be set to true if the computation
      * is supposed to be scheduled and code is supposed to be generated from
      * the computation.  Set it to false if you just want to use the
      * computation to represent a buffer (that is passed as an argument
      * to the function) and you do not intend to generate code for the
      * computation.
      * An example where this argument is set to false is presented in
      * tests/test_14.cpp.
      *
      * \p t is the type of the computation, i.e. the type of the expression
      * computed by the computation. Example of types include (p_uint8,
      * p_uint16, p_uint32, ...).
      *
      * \p fct is a pointer to the Tiramisu function where this computation
      * should be added.
      *
      * Bound Inference:
      * The user can declare computations without providing any constraint
      * about the iteration domain, in this case he can rely on bound inference
      * to infer the constraints about each iteration domain. The user needs only
      * to provide constraints over the domains of the last computations (last
      * consumers), and Tiramisu will propagate these constraints to all the
      * chain of computations that precede those consumers.
      * Note that bound inference is not possible if you have multiple definitions
      * of the same computation. In such a case, you should provide constraints
      * over the iteration domain when you declare the computation.
      *
      * Examples about bound inference are provided in test_22 to test_25.
      */
    computation(std::string iteration_domain, tiramisu::expr e,
                bool schedule_this_computation, tiramisu::primitive_t t,
                tiramisu::function *fct);

    /**
      * \brief Constructor for computations.
      *
      * \details
      *
      * Same as \ref tiramisu::computation::computation(std::string name, std::vector<var> iterator_variables, tiramisu::expr e)
      * except that it has the following additional argument:
      *
      * \p schedule_this_computation indicates whether this computation should
      * be scheduled.
      */
    computation(std::string name, std::vector<var> iterator_variables, tiramisu::expr e, bool schedule_this_computation);

    /**
      * \overload
      */
    computation(std::vector<var> iterator_variables, tiramisu::expr e);

    /**
      * \overload
      */
    computation(std::vector<var> iterator_variables, tiramisu::expr predicate, tiramisu::expr e);

    /**
      * \brief Constructor for computations.
      *
      * \details
      *
      * Declare a computation.  A computation in Tiramisu is the equivalent of a
      * a statement surrounded by a loop nest in C (an example is provided later).
      *
      * \p name is the name of the computation.
      *
      * \p iterator_variables is a vector that represents the loop iterators
      * around the computation.
      *
      * \p e is the expression computed by the computation.
      *
      * For example, if we have two iterator variables
      *
      * \code
      * var i("i", 0, 20), j("j", 0, 30);
      * \endcode
      *
      * and we have the following computation declaration
      *
      * \code
      * computation S("S", {i,j}, 4);
      * \endcode
      *
      * This is equivalent to writing the following C code
      *
      * \code
      * for (i=0; i<20; i++)
      *   for (j=0; j<30; j++)
      *      S(i,j) = 4;
      * \endcode
      *
      * More precisely, the vector {i, j} specifies the iteration domain of
      * the computation S. In this case, 0<=i<20 and 0<=j<30.
      *
      * It is possible to declare the computation without specifying the expression.
      * The expression can be specified later using computation::set_expression().
      * An example of setting the expression after declaring the computation
      * is presented in tutorial 04.  Usually this is needed for writing
      * reductions.
      *
      */
    computation(std::string name, std::vector<var> iterator_variables, tiramisu::expr e);

    /**
      * \overload
      *
      * The argument \p predicate is used to add an affine condition
      * (predicate) around the computation. This argument should only
      * be used if the predicate is affine. For non affine predicates
      * use the .add_predicate() function.
      *
      * Example, if you have
      *
      * \code
      * for (i=0; i<20; i++)
      *   if (i != 7)
      *     S(i) = 0;
      * \endcode
      *
      * You can express this as follows
      *
      * \code
      * computation S("S", {i}, (i != 7), 0);
      * \endcode
      *
      */
    computation(std::string name, std::vector<var> iterator_variables, tiramisu::expr predicate, tiramisu::expr e);

    /**
      * \overload
      */
    computation(std::vector<var> iterator_variables, tiramisu::expr e, bool schedule_this_computation);

    /**
      * \overload
      *
      * \p t is the type of the computation, i.e. the type of the expression
      * computed by the computation. Examples of types include (p_uint8,
      * p_uint16, p_uint32, ...).
      *
      * Usually, this constructor is used to declare buffer wrappers.
      *
      * For example, if we have two iterator variables
      *
      * \code
      * var i("i", 0, 20), j("j", 0, 30);
      * \endcode
      *
      * and we have the following computation declaration
      *
      * \code
      * computation S("S", {i,j}, p_uint8);
      * \endcode
      *
      * This can be used as a wrapper on a buffer buf[20, 30] where the
      * buffer elements are of type uint8.
      */
    // Delegates to the expression-based constructor with an expression
    // built only from the primitive type (no computed value attached).
    computation(std::string name, std::vector<var> iterator_variables, primitive_t t)
            : computation(name, iterator_variables, expr(t)) {}

    /**
      * \overload
      *
      * Unnamed variant: delegates to
      * computation(iterator_variables, expr(t)).
      */
    computation(std::vector<var> iterator_variables, primitive_t t)
            : computation(iterator_variables, expr(t)) {}

    virtual bool is_send() const;

    virtual bool is_recv() const;

    virtual bool is_send_recv() const;

    virtual bool is_wait() const;

    /**
       * \brief Add a let statement that is associated to this computation.
       * \details The let statement will be executed before the computation
       * (more precisely, between this computation and any computation that
       * precedes it). The variable defined by the let statement is
       * accessible only to this computation, i.e., it cannot be used
       * in any other computation.
       */
    void add_associated_let_stmt(std::string access_name, tiramisu::expr e);

    /**
     * Don't schedule a previously scheduled computation
     */
    void unschedule_this_computation();

    /**
     * \brief Add definitions of computations that have the same name as this
     * computation.
     *
     * \details The arguments of this function are identical to the arguments of
     * the computation constructor.  In general, this function is used to express
     * reductions and to express computation updates.
     *
     * In other words, this function should be used if the user has already declared
     * a set of computations C and wants to declare more computations that have the
     * same name.
     *
     * Example: Let's assume we want to declare the following two computations.
     *
     * \code
     * // First computation
     * {C[i]: 0<=i<10}: 0
     *
     * // Second computation
     * {C[i]: 10<=i<20}: 1
     * \endcode
     *
     * To do this, we can declare the first computation using the computation
     * constructor and declare the second computation using add_definitions().
     *
     * The use of add_definitions is purely due to restrictions imposed by the C++ language
     * and not by the Tiramisu framework itself.  This is mainly because in C++, it is not
     * possible to declare two objects with the same name, for example one cannot do
     *
     * computation C(...);
     * computation C(...);
     *
     * In order to declare the second set of computations, we chose to use the add_definitions
     * function to avoid this problem.
     *
     * The newly added computations must have the same name and the same access function
     * as the initial set of computations but can have a different expression.
     *
     * An example of using this function is available in test_26.
     *
     */
    virtual void add_definitions(std::string iteration_domain_str, tiramisu::expr e,
                                 bool schedule_this_computation, tiramisu::primitive_t t,
                                 tiramisu::function *fct);

    /**
     * \brief Add a predicate (condition) on the computation. The computation will be executed
     * only if this condition is true.
     *
     * \details The predicate can be an affine or a non-affine expression.
     * If you need to put a condition around a block of statements (i.e., a sequence of computations),
     * then you can perform that by adding a predicate to each one of those computations.
     * The compiler will then transform automatically the multiple conditions (condition around each
     * computation) into one condition around the whole block.
     */
    void add_predicate(tiramisu::expr predicate);

    /**
      * \brief Schedule this computation to run after the computation \p comp.
      *
      * \details This computation is placed after \p comp in the loop level \p level.
      * \p level is a loop level in this computation.
      *
      * The root level is computation::root.  The global variable
      * computation::root is equivalent to var("root").
      *
      * For example assuming we have the two computations
      *
      *     {S0[i,j]: 0<=i<N and 0<=j<N} and {S1[i,j]: 0<=i<N and 0<=j<N}
      *
      * In order to make S1 run after S0 in the i loop, one should use
      *
      *     S1.after(S0, i)
      *
      * which means: S1 is after S0 at the loop level i (which is loop level 0).
      *
      * The corresponding code is
      *
      * \code
      *     for (i=0; i<N; i++)
      *     {
      *         for (j=0; j<N; j++)
      *             S0;
      *         for (j=0; j<N; j++)
      *             S1;
      *     }
      * \endcode
      *
      * S1.after(S0, j)
      *
      * means: S1 is after S0 at the loop level j (which is 1) and would yield
      * the following code
      *
      * \code
      * for (i=0; i<N; i++)
      *   for (j=0; j<N; j++)
      *   {
      *     S0;
      *     S1;
      *   }
      * \endcode
      *
      * S1.after(S0, computation::root)
      * means S1 is after S0 at the main program level and would yield
      * the following code
      *
      * \code
      * for (i=0; i<N; i++)
      *   for (j=0; j<N; j++)
      *     S0;
      * for (i=0; i<N; i++)
      *   for (j=0; j<N; j++)
      *     S1;
      * \endcode
      *
      * Note that as with all other scheduling methods:
      *     - Calling this method with the same computations overwrites the level if it is
      *     higher.
      *     - A computation being scheduled after another computation at level L means it is
      *     scheduled after that computation at all levels lower than L.
      *     - There should be exactly one computation with no computation scheduled before it.
      *     - Each other computation should have exactly one computation scheduled before it.
      *
      */
    void after(computation &comp, tiramisu::var iterator);

    /**
      * This function is equivalent to
      *     void after(computation &comp, tiramisu::var iterator);
      * except that it uses loop level numbers (0, 1, 2, ...) instead of using loop variables
      * (tiramisu::var).  Tiramisu internally represent loop levels using numbers instead
      * of variable names, and this is the actual function used internally.
      *
      * The outermost loop level is 0.  The root level is computation::root_dimension.
      *
      * For example assuming we have the two computations
      *
      *     {S0[i,j]: 0<=i<N and 0<=j<N} and {S1[i,j]: 0<=i<N and 0<=j<N}
      *
      * In order to make S1 run after S0 in the i loop, one should use
      *
      *     S1.after(S0,0)
      *
      * which means: S1 is after S0 at the loop level 0 (which is i).
      *
      * The corresponding code is
      *
      * \code
      *     for (i=0; i<N; i++)
      *     {
      *         for (j=0; j<N; j++)
      *             S0;
      *         for (j=0; j<N; j++)
      *             S1;
      *     }
      * \endcode
      */
    void after(computation &comp, int level);

    /**
      * Schedule this computation to run after the computation \p comp.
      * The computations are placed after each other in the loop level \p level.
      * The outermost loop level is 0.  The root level is computation::root_dimension.
      *
      * For example assuming we have the two computations
      *
      *     {S0[i,j]: 0<=i<N and 0<=j<N} and {S1[i,j]: 0<=i<N and 0<=j<N}
      *
      * In order to make S1 run after S0 in the i loop, one should use
      *
      *     S1.after_low_level(S0,0)
      *
      * which means: S1 is after S0 at the loop level 0 (which is i).
      *
      * The corresponding code is
      *
      * \code
      *     for (i=0; i<N; i++)
      *     {
      *         for (j=0; j<N; j++)
      *             S0;
      *         for (j=0; j<N; j++)
      *             S1;
      *     }
      * \endcode
      *
      * S1.after_low_level(S0,1)
      *
      * means: S1 is after S0 at the loop level 1 (which is j) and would yield
      * the following code
      *
      * \code
      * for (i=0; i<N; i++)
      *   for (j=0; j<N; j++)
      *   {
      *     S0;
      *     S1;
      *   }
      * \endcode
      *
      * S1.after_low_level(S0, computation::root_dimension)
      * means S1 is after S0 at the main program level and would yield
      * the following code
      *
      * \code
      * for (i=0; i<N; i++)
      *   for (j=0; j<N; j++)
      *     S0;
      * for (i=0; i<N; i++)
      *   for (j=0; j<N; j++)
      *     S1;
      * \endcode
      *
      * To specify that this computation is after \p comp in multiple levels,
      * the user can provide those levels in the \p levels vector.
      *
      * S1.after_low_level(S0, {0,1})
      *
      * means that S1 is after S0 in the loop level 0 and in the loop level 1.
      *
      * Note that
      *
      * S1.after_low_level(S0, L)
      *
      * would mean that S1 and S0 share the same loop nests for all the loop
      * levels that are before L and that S1 is after S0 in L only.  S1 is not
      * after S0 in the loop levels that are before L.
      *
      */
    // @{
    void after_low_level(computation &comp, int level);
    void after_low_level(computation &comp, std::vector<int> levels);
    // @}
    /**
     * \brief Allocate a buffer for the computation automatically.  The size of the buffer
     * is deduced automatically and a name is assigned to it automatically.
     *
     * \details Assuming the name of the computation is C, the name of the generated buffer
     * is _C_buffer.
     *
     * \p type is the type of the argument. Three possible types exist:
     *  - a_input: for inputs of the function,
     *  - a_output: for outputs of the function,
     *  - a_temporary: for buffers used as temporary buffers within
     *  the function (any temporary buffer is allocated automatically by
     *  the Tiramisu runtime at the entry of the function and is
     *  deallocated at the exit of the function).
     */
    void allocate_and_map_buffer_automatically(tiramisu::argument_t type = tiramisu::a_temporary);

    /**
      * \brief Apply a transformation on the schedule.
      *
      * \details This transformation is from
      * the time-space domain to the time-space domain.  It is applied on
      * the range of the schedule (i.e., on the output of the schedule relation).
      *
      * For example, to shift the i dimension of the time-processor domain
      * of C0, you can apply the transformation
      *
      * C0[0, 0, i, 0, j, 0] -> C0[0, 0, i+2, 0, j, 0]
      *
      * To apply an interchange, you would do
      *
      * C0[0, 0, i, 0, j, 0] -> C0[0, 0, j, 0, i, 0]
      */
    void apply_transformation_on_schedule(std::string map_str);

    /**
      * \brief Automatically allocate and return a buffer for this computation.
      *
      * \details This method is meant to be used to create arguments for function::codegen.
      * See test 141 for an example.
      */
    buffer* auto_buffer();

    /**
      * \brief Schedule this computation to run before the computation \p consumer
      * at the loop level \p L.
      *
      * \details Notes
      *     - The loop level \p L is a loop level of this computation.
      *     - Use computation::root to indicate the root dimension
      *       (i.e. the outermost time-space dimension).
      *     - Calling this method with the same computations overwrites the level if it is
      *     higher.
      *     - A computation being scheduled after another computation at level L means it is
      *     scheduled after that computation at all levels lower than L.
      *     - There should be exactly one computation with no computation scheduled before it.
      *     - Each other computation should have exactly one computation scheduled before it.
      */
    // @{
    void before(computation &consumer, tiramisu::var L);
    // @}

    /**
      * Schedule this computation to run after \p before_comp at the loop level \p before_l,
      * and before \p after_comp at loop level \p after_l. The outermost loop level is 0.
      *
      * Use computation::root_dimension to indicate the root dimension
      * (i.e. the outermost time-space dimension).
      *
      * If there was already a direct scheduling between \p before_comp and \p after_comp
      * (e.g. using before, after, between...), that schedule is overwritten; i.e. it no longer
      * exists/has an effect.
      *
      * Note that as with all other scheduling methods:
      *     - Calling this method with the same computations overwrites the levels if they are
      *     higher.
      *     - A computation being scheduled after another computation at level L means it is
      *     scheduled after that computation at all levels lower than L.
      *     - There should be exactly one computation with no computation scheduled before it.
      *     - Each other computation should have exactly one computation scheduled before it.
      */
    void between(computation &before_comp, tiramisu::var before_l, computation &after_comp, tiramisu::var after_l);

    /**
      * This function is equivalent to void between(computation &before_comp, tiramisu::var before_l,
      * computation &after_comp, tiramisu::var after_l); except that it uses loop level numbers
      * (0, 1, 2, ...) instead of using loop variables (tiramisu::var).
      */
    void between(computation &before_comp, int before_l, computation &after_comp, int after_l);

    /**
       * \brief Store this computation in \p buff
       *
       * \details
       *
       * Let us assume that we have a computation C:
       *
       * \code
       * {C[i]: 0<=i<N}
       * \endcode
       *
       * and that we want to store each C(i) in bufC[i]. Then we
       * can use store_in() to indicate that as follows:
       *
       * \code
       * C.store_in(&bufC)
       * \endcode
       *
       * This means that each computation C(i) will be stored
       * in the buffer location bufC[i].
       *
       * If \p iterators is specified, the \p iterators are used to specify how the
       * computation is mapped to the buffer.
       * If the dimensions of this computation are in0, in1, ..., inn and if
       * \p iterators are equal to im0, im1, ..., imm then the computation is
       * mapped as follows
       *
       * \code
       * C[in0, in1, ..., inn]->bufC[im0, im1, ..., imm].
       * \endcode
       *
       * i.e., the computation C[in0, in1, ..., inn] is stored in bufC[im0,
       * im1, ..., imm].
       *
       * This can be used to store the data in many ways (reordering the
       * storage, storing into modulo buffers, ...).
       *
       * Assuming we have a computation D(i,j) that has the following
       * iteration domain:
       *
       * \code
       * {D[i,j]: 0<=i<N and 0<=j<N}
       * \endcode
       *
       * and assuming we have a buffer bufD.
       *
       * The store_in() function can be used to implement many types of data mappings:
       *    - Store the computation D to a scalar: D.store_in(&bufD, {}).
       *      This means that D(i,j) will be stored in bufD[0] (which represents a
       *      scalar).
       *    - Store a 2 dimensional computation into a 1-dimensional
       *    buffer: D.store_in(&bufD, {i});
       *    - Change the order of storage.
       *    D.store_in(&bufD, {j, i}) will store D(i,j) in bufD(j,i).
       *    - Store the computation in a circular buffer (modulo storage).
       *    D.store_in(&bufD, {i%4, j%4});
       *    This will store D(i,j) in bufD[i%4, j%4].  Assuming the buffer
       *    bufD is a 4x4 buffer.
       */
     // @{
     void store_in(buffer *buff);
     void store_in(buffer *buff, std::vector<expr> iterators);
      // @}

    /**
      * \brief Resize the implicit buffer and remap the computation.
      *
      * \details By default Tiramisu computations are stored in implicit
      * buffers with one to one mapping. If user wants to store the data in a
      * different layout (i.e. for reductions), this function should be used to
      * change the size of the implicit buffer and mapping.
      *
      * For example a simple reduction operation would look like this:
      *
      * \code
      * computation C({i, j}, p_float);
      * C.set_expression(C(i, 0) + B(i, j));
      * C.store_in({i}, {N});
      * \endcode
      *
      * Note that this function creates a completely new implicit buffer, thus
      * the old implicit buffer should not be referenced before this operation.
      */
    // @{
    void store_in(std::vector<expr> mapping, std::vector<expr> sizes);
    // @}

    /**
     * Utilize shared memory layer when accessing the input computation.
     *
     * Whenever the threads of the same block are accessing the same global
     * memory location (or same cache line), copying values to shared memory
     * first and accessing values from there gives significant performance
     * benefits.
     *
     * This is a helper function to generate necessary buffers and
     * copy computations to achieve this indirection. The function is
     * half-manual as user needs to provide the area that needs to be copied
     * and ensure correctness by making sure the access area under the
     * \p level is covered by the shared memory cache.
     *
     * \p level is the level after which the accesses will be cached.
     *
     * \p buffer_shape is the shape of the shared memory buffer to be used.
     * It should be able to fit in device shared memory (usually 48KB total).
     * It should have the same dimensionality as the input computation.
     *
     * \p copy_offsets is the offset of the values that should be copied
     * from input computation at iteration of each \p level.
     *
     * If \p pad_buffer is true, the innermost dimension of shared memory buffer
     * is padded by 1 which might help reduce the shared memory bank conflicts.
     *
     * Returns the new access computation for input.
     *
     * An example use case for GEMM:
     *
     * \code
     * computation C({i, j, k}, C(i, j) + A(i, k) * B(k, j));
     * C.gpu_tile(i, j, 16, 16, i0, j0, i1, j1);
     * C.split(k, 32, k0, k1);
     * C.cache_shared(A, k0, {16, 32}, {i0 * 16, k0 * 32});
     * C.cache_shared(B, k0, {32, 16}, {k0 * 32, j0 * 16});
     * \endcode
     *
     */
    computation *cache_shared(computation &inp, const var &level,
                      const std::vector<int> buffer_shape,
                      const std::vector<expr> copy_offsets,
                      bool pad_buffer=false);

    /**
      * This function assumes that \p consumer consumes values produced by
      * this computation (which is the producer).
      *
      * Compute this computation as needed for each unique value of the
      * \p consumer.
      *
      * This computation is scheduled so that the values consumed by the
      * \p consumer are computed at the level \p L and in the same loop
      * nest of the consumer. \p L is a loop level in the consumer.
      *
      * If the consumer needs this computation to be computed redundantly,
      * the function creates the necessary redundant computations and schedules
      * them before the consumer.
      *
      * This function performs the following:
      *     - schedules this computation to be executed as needed before
      *     the consumer.
      *     - if this computation needs to be computed redundantly, redundant
      *     computations are created.
      *
      * This function does not:
      *     - create any data mapping to this computation. It is up to the
      *     user to provide an access relation to this computation as he
      *     would do to any other normal computation.
      *     - it does not allocate any buffer to this computation. It is
      *     up to the user to declare a buffer where the results of this
      *     computation will be stored.
      *
      * If this functions creates a duplicate of the computation, the user
      * does not need to set its access relation.  The duplicated computation
      * will automatically have the same access relation as the original
      * computation. This access relation is set automatically.
      *
      * This function does not return a handler to manipulate the duplicate
      * computation. It does not allow the user to manipulate the duplicate
      * freely.  The duplicate is scheduled automatically to be executed
      * before the consumer.
      */
    void compute_at(computation &consumer, tiramisu::var L);
    void compute_at(computation &consumer, int L);

    /**
      * Generates the time-space domain and construct an AST that scans that
      * time-space domain, then compute the depth of this AST.
      * This is useful for example to know if all the dimensions of the time-space
      * domain will correspond to a loop level in the final generated AST.
      */
    int compute_maximal_AST_depth();

    /**
      * Dump the iteration domain of the computation.  This is useful for
      * debugging.
      */
    void dump_iteration_domain() const;

    /**
      * Dump the schedule of the computation.  This is mainly useful for
      * debugging.
      *
      * The schedule is a relation between the iteration space and the
      * time space.  The relation provides a logical date of execution for
      * each point in the iteration space.
      * The schedule needs first to be set before calling this function.
      */
    void dump_schedule() const;

    /**
      * Dump the computation on stdout.  This is mainly useful for
      * debugging.
      */
    void dump() const;

    /**
      * Fuse this computation with the computation passed as argument
      * in the same loop.  Run this computation after that computation.
      * Fuse them at the loop level \p lev.
      *
      * For example, assuming we have the following computations
      *
      * {S0[i,j]: 0<=i<N and 0<=j<N}, {S1[i,j]: 0<=i<N and 0<=j<N}
      * and {S2[i,j]: 0<=i<N and 0<=j<N}.
      *
      * Without fusion, these computations would be equivalent
      * to the following loops nests
      *
      * \code
      * for (i=0; i<N; i++)
      *   for (j=0; j<N; j++)
      *     S0;
      *
      * for (i=0; i<N; i++)
      *   for (j=0; j<N; j++)
      *     S1;
      *
      * for (i=0; i<N; i++)
      *   for (j=0; j<N; j++)
      *     S2;
      * \endcode
      *
      * To fuse them, one should call
      *
      * \code
      * S2.fuse_after(j, S1);
      * S1.fuse_after(j, S0);
      * \endcode
      *
      * This would result in fusing S2 with S0 and S1 at loop level j.
      * S2 will be scheduled for execution after S0 and S1.  The resulting code would look like
      *
      * \code
      * for (i=0; i<N; i++)
      *   for (j=0; j<N; j++)
      *   {
      *     S0;
      *     S1;
      *     S2;
      *   }
      * \endcode
      *
      * Calling
      *
      * \code
      * S2.fuse_after(i, S1);
      * S1.fuse_after(i, S0);
      * \endcode
      *
      * would result in the following code
      *
      * \code
      * for (i=0; i<N; i++)
      * {
      *   for (j=0; j<N; j++)
      *     S0;
      *   for (j=0; j<N; j++)
      *     S1;
      *   for (j=0; j<N; j++)
      *     S2;
      * }
      * \endcode
      *
      */
    void fuse_after(tiramisu::var lev, computation &comp)
    {
        // Fusion must be anchored to a named loop level.
        assert(!lev.get_name().empty());

        // Fusion at level \p lev is expressed as an ordering constraint:
        // schedule this computation after \p comp at that loop level.
        this->after(comp, lev);
    }

    /**
      * Generate the time-space domain of the computation.
      *
      * In this representation, the logical time of execution and the
      * processor where the computation will be executed are both
      * specified.  The memory location where computations will be
      * stored in memory is not specified at this level.
      */
    void gen_time_space_domain();

    /**
      * Specify that the rank loop iterator should be removed from linearization.
      */
    void drop_rank_iter(var level);

    /**
      * Get the buffer that is used to store this computation. Buffer is
      * determined using the access function.
      */
    buffer *get_buffer() const;

    /**
      * Get the data type of the computation.
      */
    primitive_t get_data_type() const;

    /**
      * Return the Tiramisu expression associated with the computation.
      */
    const tiramisu::expr &get_expr() const;

    /**
     * Return the iteration domain of the computation.
     * In this representation, the order of execution of computations
     * is not specified, the computations are also not mapped to memory.
     */
    isl_set *get_iteration_domain() const;

    /**
      * Get the last update of a computation.
      */
    computation &get_last_update();

    /**
      * Search the time-space domain (the range of the schedule) and
      * return the loop level number that correspond to the dimension
      * named \p dim.
      * In other words, translate the vector of dimension name (\p dim_name)
      * into a loop level number.
      */
    int get_loop_level_number_from_dimension_name(std::string dim_name)
    {
        // Delegate to the vector-based lookup with a single-element list
        // of names and unwrap the single resulting loop level number.
        const std::vector<std::string> single_name = {dim_name};
        return this->get_loop_level_numbers_from_dimension_names(single_name).front();
    }

    /**
      * Return the name of the computation.
      */
    const std::string &get_name() const;

    /**
      * Returns a pointer to the computation scheduled immediately before this computation,
      * or a null pointer if none exist.
      */
    computation *get_predecessor();

    /**
      * Returns a pointer to the computation scheduled immediately after this computation,
      * or a null pointer if none exist.
      */
    computation *get_successor();

    /**
      * Returns the \p index update that has been added to this computation such that:
      * - If \p index == 0, then this computation is returned.
      * - If \p index > 0, then it returns the \p index-th computation added through add_definitions().
      */
    computation& get_update(int index);

    /**
     * Get the schedule of the computation.
     */
    isl_map *get_schedule() const;

    /**
      * Tile the computation and then tag the outermost tile dimension
      * to be mapped to GPU blocks and tag the innermost tile dimensions
      * to be mapped to GPU threads.
      *
      * Tile the two loop levels \p L0 and \p L1 with rectangular
      * tiling. \p sizeX and \p sizeY represent the tile size.
      * \p L0 and \p L1 should be two consecutive loop levels
      * (i.e., \p L0 = \p L1 + 1) and they should satisfy
      * \p L0 > \p L1.
      *
      * \p L0_outer, \p L1_outer, \p L0_inner, \p L1_inner
      * are the names of the new dimensions created after tiling.
      */
    // @{
    virtual void gpu_tile(var L0, var L1, int sizeX, int sizeY);
    virtual void gpu_tile(var L0, var L1, int sizeX, int sizeY,
                          var L0_outer, var L1_outer,
                          var L0_inner, var L1_inner);
    virtual void gpu_tile(var L0, var L1, var L2, int sizeX, int sizeY, int sizeZ);
    virtual void gpu_tile(var L0, var L1, var L2, int sizeX, int sizeY, int sizeZ,
                          var L0_outer, var L1_outer, var L2_outer,
                          var L0_inner, var L1_inner, var L2_inner);
    // @}

    /**
      * Return the buffer that was allocated automatically using
      * high level data mapping functions.
      * If no automatic buffer was allocated, this function returns NULL.
      */
    buffer *get_automatically_allocated_buffer();

    /**
      * Interchange (swap) the two loop levels \p L0 and \p L1.
      */
    virtual void interchange(var L0, var L1);

    /**
      * Identical to
      *     void interchange(var L0, var L1);
      */
    virtual void interchange(int L0, int L1);

    /**
      * Mark this statement as a let statement.
      */
    void mark_as_let_statement();

    /**
      * Mark this statement as a library call.
      */
    void mark_as_library_call();

    /**
      * Tag the loop level \p L to be parallelized.
      *
      * This function is equivalent to the function \ref tiramisu::computation::tag_parallel_level() .
      * There is no difference between the two.
      *
      */
    virtual void parallelize(var L);

    /**
       * Set the access relation of the computation.
       *
       * The access relation is a relation from computations to buffer locations.
       * \p access_str is a string that represents the relation. It is encoded
       * in the ISL format, http://isl.gforge.inria.fr/user.html#Sets-and-Relations
       * of relations.
       *
       * Note that, in Tiramisu, the access relations of computations that have the same name
       * must be identical.
       *
       * Examples: tutorial_01, tutorial_02, tutorial_08 (actually most tutorials have set_access()).
       */
     // @{
     void set_access(std::string access_str);
     void set_access(isl_map *access);
     // @}

    void set_wait_access(std::string access_str);
    void set_wait_access(isl_map *access);

     /**
       * Set the expression of the computation.
       */
     void set_expression(const tiramisu::expr &e);

     /**
       * Sets whether the computation is inline or not, based on the value of \p is_inline.
       * If a computation is inline, accesses to the computation return the expression of that
       * computation.
       * E.g. if an inline computation S(i,j) is defined with the expression i + j,
       * then S(i + 1, j * i) returns the expression i + 1 + j * i.
       * If \p is_inline is not provided, the computation is set to be inline.
       */
     void set_inline(bool is_inline = true);

     /**
       * Returns true if and only if the computation is inline.
       *
       * NOTE(review): the top-level `const` on the by-value bool return has
       * no effect; the declaration is left unchanged because the out-of-line
       * definition elsewhere must match this exact return type.
       */
    const bool is_inline_computation() const;

     /**
       * Set the schedule indicated by \p map.
       *
       * \p map is a string that represents a mapping from the iteration domain
       * to the time-space domain (the ISL format to represent maps is
       * documented in http://barvinok.gforge.inria.fr/barvinok.pdf in Sec 1.2.2).
       *
       * The schedule is a map from the iteration domain to a time space
       * domain. The same name of space should be used for both the range
       * and the domain of the schedule.
       *
       * In general, users can set the schedule using high level functions such
       * as before(), after(), tile(), compute_at(), vectorize(), split(), ...
       * The use of this function is only reserved for advanced users who want
       * a low level control of the schedule.
       *
       * Vectors in the time-space domain have the following form
       *
       * computation_name[redundancy_ID,static,dynamic,static,dynamic,static,dynamic,static,...]
       *
       * The first dimension of the vector is used to indicate the redundancy ID
       * (the notion of the redundancy ID is explained later).
       *
       * The following dimensions are interleaved dimensions: static, dynamic, static,
       * dynamic, ...
       * Dynamic dimensions represent the loop levels, while static dimensions are
       * used to order statements within a given block of statements in a given loop
       * level.
       * For example, the computations c0 and c1 in the following loop nest
       *
       * \code
       * for (i=0; i<N: i++)
       *   for (j=0; j<N; j++)
       *   {
       *     c0;
       *     c1;
       *   }
       * \endcode
       *
       * have the following representations in the iteration domain
       *
       * \code
       * {c0[i,j]: 0<=i<N and 0<=j<N}
       * {c1[i,j]: 0<=i<N and 0<=j<N}
       * \endcode
       *
       * and the following representation in the time-space domain
       *
       * \code
       * {c0[0,0,i,0,j,0]: 0<=i<N and 0<=j<N}
       * {c1[0,0,i,0,j,1]: 0<=i<N and 0<=j<N}
       * \endcode
       *
       * The first dimension (dimension 0) in the time-space
       * domain (the leftmost dimension) is the redundancy ID
       * (in this case it is 0, the meaning of this ID will be explained later).
       * The second dimension (starting from the left) is a static dimension,
       * the third dimension is a dynamic dimension that
       * represents the loop level i, ..., the fifth dimension is a dynamic
       * dimension that represents the loop level j and the last dimension
       * (dimension 5) is a static dimension and allows the ordering of
       * c1 after c0 in the loop nest.
       *
       * To transform the previous iteration domain to the
       * time-space domain, the following schedule should be used:
       *
       * \code
       * {c0[i,j]->c0[0,0,i,0,j,0]: 0<=i<N and 0<=j<N}
       * {c1[i,j]->c1[0,0,i,0,j,1]: 0<=i<N and 0<=j<N}
       * \endcode
       *
       * The first dimension called "redundancy ID" is only meaningful if the
       * computation is redundant. i.e., some parts of the computation are
       * redundantly computed.  Redundant computations are in general used to
       * maximize parallelism and data locality on the expense of doing some
       * computations redundantly.
       * For example, if the two computations c1(i,j) and c2(i,j) both depend
       * on the computation c0(i,j), instead of waiting for c0(i,j) to be
       * computed and then computing c1(i,j) and c2(i,j) in parallel, the thread
       * executing c1(i,j) can compute c0(i,j) by itself and then run c1(i,j).
       * The thread that computes c2(i,j) can do the same and compute c0(i,j)
       * by itself and then compute c2(i,j). In this case the two threads do not
       * need to wait. This is done at the expense of redundant computation since
       * c0(i,j) is computed by both threads.
       *
       * In general redundant computations are useful when tiling stencil
       * computations.  In the context of stencils such a tiling is called
       * "overlapped tiling".  Tiles that depend on results computed by other
       * tiles that run in parallel can compute the boundaries redundantly which
       * allows them to avoid waiting and thus can run in parallel.
       *
       * In Tiramisu, the user can indicate that a chunk of a computation
       * should be computed redundantly. The original computation always has a redundancy
       * ID equal to 0 (which means this is the original computation).
       * The redundant chunk has an ID that is different from 0 and that is
       * used to uniquely identify it.
       *
       * For example if we want to compute all of c0 three times (that is,
       * compute the original computation and compute two redundant computations),
       * we can use the following schedules:
       *
       * The schedule of the original computation:      {c0[i,j]->c0[0,0,i,0,j,0]: 0<=i<N and 0<=j<N}
       * The schedule of the redundant computation N°1: {c0[i,j]->c0[1,0,i,0,j,0]: 0<=i<N and 0<=j<N}
       * The schedule of the redundant computation N°2: {c0[i,j]->c0[2,0,i,0,j,0]: 0<=i<N and 0<=j<N}
       *
       * The schedule of c0 in this case would be three maps that map c0[i,j] to
       * the three different redundant computations in the time-processor domain:
       *
       * \code
       * {c0[i,j]->c0[0,0,i,0,j,0]: 0<=i<N and 0<=j<N;
       *  c0[i,j]->c0[1,0,i,0,j,0]: 0<=i<N and 0<=j<N;
       *  c0[i,j]->c0[2,0,i,0,j,0]: 0<=i<N and 0<=j<N}
       * \endcode
       *
       * The function set_schedule() overrides any other schedule set by the high level
       * scheduling functions.  Currently the user has to choose between using the high
       * level scheduling functions or using this low level set_schedule function. The user
       * cannot mix the use of the two in the same program because they are not compatible.
       */
     // @{
     void set_low_level_schedule(isl_map *map);
     void set_low_level_schedule(std::string map_str);
     // @}

    /**
      * Shift the loop level \p L0 of the iteration space by
      * \p n iterations.
      *
      * \p n can be a positive or a negative number. A positive
      * number means a shift forward of the loop iterations while
      * a negative value would mean a shift backward.
      */
    virtual void shift(var L0, int n);
    

    /*
      * Apply loop skewing on the loop levels \p i and \p j with a skewing factor of \p f.
      * The names of the new loop levels is \p ni and \p nj.
      *
      * This command transforms the loop (i, j) into the loop (i, f*i+j).
      * 
      * This method requires the application of loop interchange afterwards to really change the execution order.
      * This is legacy code that should not be used on new projects
      * For example if you have the following loop
      *
      * \code
      for (int i = 1; i < N; i++)
        for (int j = 1; j < M; j++)
          a[i][j] = a[i - 1][j] + a[i][j - 1];
       \endcode

      * and apply

      \code
        a.skew(i, j, 1, ni, nj);
      \endcode

      * you would get
      *
      \code
      for (int i = 1; i < N; i++)
        for (int j = i+1; j < i+M; j++)
          a[i][j - i] = a[i - 1][j - i] + a[i][j - i - 1];
      \endcode

      * Apply loop skewing on the loop levels \p i, \p j and \p k with a skewing factor of \p f.
      * The names of the new loop levels is \p ni, \p nj and \p nk.
      * This command transforms the loop (i, j, k) into the loop (i, f*i+j, f*i+k).
      * 
      * Apply loop skewing on the loop levels \p i, \p j, \p k, \p l with a skewing factor of \p f.
      * The names of the new loop levels is \p ni, \p nj, \p nk and \p nl.
      * This command transforms the loop (i, j, k, l) into the loop (i, f*i+j, f*i+k, f*i+l).

    // @{
    virtual void skew(var i, var j, int f, var ni, var nj);

    virtual void skew(var i, var j, var k, int factor,
                      var ni, var nj, var nk);
   
    virtual void skew(var i, var j, var k, var l, int factor,
                      var ni, var nj, var nk, var nl);

    virtual void skew(var i, var j, int factor);

    virtual void skew(var i, var j, var k, int factor);

    virtual void skew(var i, var j, var k, var l, int factor);

    virtual void skew(int i, int j, int factor);

    virtual void skew(int i, int j, int k, int factor);

    virtual void skew(int i, int j, int k, int l, int factor);
    // @}
    */
    
    /**
      * Apply skewing on the loop levels \p i and \p j and replace them with the new loop levels i' and j' such that:
      * i' = a*i +b*j 
      * j' is calculated automatically to guarantee that its increment is always 1.

      * additional constraints for inputs are :  b!=0 & a >=0
      *           a & b must be coprime (prime between themselves)
      * This command transforms the loop (i, j) into the loop (a*i+b*j,j').
      * For example if you have the following loop
      *
      * \code
      for (int i = 1; i < N; i++)
        for (int j = 1; j < M; j++)
          a[i][j] = a[i - 1][j] + a[i][j - 1];
       \endcode

      * and apply

      \code
        a.skew(i, j, 1, 1, ni, nj);
      \endcode

      * you would get
      *
      \code
      for (int i = 1; i < 2N; i++)
        for (int j = max(i,N-i); j < Min(i,N) ; j++)
          a[i-j][j] = a[i - j - 1][j] + a[i-j][j - 1];
      \endcode
      */
    // @{
    virtual void skew(var i, var j, int a , int b, var ni, var nj);

    virtual void skew(int i, int j, int a, int b); 
    // @}

        /**
      * Apply skewing on the loop levels \p i and \p j and replace them with the new loop levels i' and j' such that:
      * i' = a*i +b*j 
      * j' = gamma*i + sigma*j.
      *  |a*sigma - gamma*b| = 1
      * additional constraints for inputs are :  b!=0 & a >=0
      *           a & b must be prime between themselves
      * This command transforms the loop (i, j) into the loop (a*i+b*j,j').
      * For example if you have the following loop
      *
      * \code
      for (int i = 1; i < N; i++)
        for (int j = 1; j < M; j++)
          a[i][j] = a[i - 1][j] + a[i][j - 1];
       \endcode

      * and apply

      \code
        a.skew(i, j, 1, 1, 0, 1, ni, nj);
      \endcode

      * you would get
      *
      \code
      for (int i = 1; i < 2N; i++)
        for (int j = max(i,N-i); j < Min(i,N) ; j++)
          a[i-j][j] = a[i - j - 1][j] + a[i-j][j - 1];
      \endcode
      */
    // @{
    virtual void skew(var i, var j, int alpha , int beta, int gamma , int sigma, var ni, var nj);

    virtual void skew(int i, int j, int alpha , int beta, int gamma , int sigma); 
    // @}

    /**
      * applied to a computation's loop level i : it inverts the execution order for this specific loop
      * i.e : original i : 0 -> n to :
      * reversed i : n -> 0
      * This command transforms the loop (i) into the loop (-i)  
    */
   // @{
    virtual void loop_reversal(var old_var, var new_var);

    
    virtual void loop_reversal(int i);
    // @}


    /**
     * Checks the correctness of a subset of dependencies after applying changes on the schedules (e.g., tiling, skewing, and shifting).
     * The checked subset of dependencies is the set of dependencies mapping from this computation (this) to second computation (second).
     * This methods returns a boolean: True if this subset of dependencies is respected, otherwise False.
     * It relies fully on the dependence analysis result, so the  method \p perform_full_dependency_analysis() must be invoked before.
     * To correctly invoke this method : schedules must be aligned (same out dimension size) and ordered,
     * so invoking \p prepare_schedules_for_legality_checks() method before is mandatory. 
    */
    virtual bool involved_subset_of_dependencies_is_legal(tiramisu::computation * second) ;

     
    /**
     * Checks if it's possible to unroll the loop level \p l for this computation.
     * This is True when the bounds of the loop are constants. 
     **/
    virtual bool unrolling_is_legal(var l) ;

    /**
      * Split the loop level \p L0 of the iteration space into two
      * new loop levels.
      *
      * \p sizeX is the extent (size) of the inner loop created after
      * splitting.
      */
    //@{
    virtual void split(var L0, int sizeX);
    virtual void split(var L0, int sizeX,
                       var L0_outer, var L0_inner);
    //@}

    /**
      * Identical to
      *     void split(var L0, int sizeX);
      */
    virtual void split(int L0, int sizeX);

     /**
      * Split the loop level \p L0 of the iteration space into two
      * new loop levels.
      *
      * \p sizeX is the extent (size) of the inner loop created after
      * splitting.
      * Takes into consideration the lower bound so that max() and min() would not appear if possible(implicit shifting).
      */
    virtual void split_with_lower_bound(int L0, int sizeX, std::string lower_bound);

    /**
     * Fold the storage of the computation.
     * Fold the loop level \p dim by a factor \p f.
     */
    virtual void storage_fold(var dim, int f);

    /**
     * Allocate the storage of this computation in the loop level \p L0.
     *
     * This function does the following:
     *  - computes the size of the buffer needed to store this computation
      *  (TODO: currently the size computed by Tiramisu is equal to the size
     *  of the computation, Tiramisu does not allocate smaller buffers if
     *  such a thing is possible, this is left for future work).
     *  - allocates a temporary buffer with the appropriate size,
     *  - schedules the allocation operation to be executed in the loop
     *  nest where \p comp is computed at the loop level \p L0.
     *
     * The function returns the computation (operation) that allocates
     * the buffer.  The allocated buffer is not returned.
     */
    computation *store_at(computation &comp, var L0);

    /**
      * Tag the loop level \p L0 and \p L1 to be mapped to GPU.
      */
    // @{
    void tag_gpu_level(tiramisu::var L0, tiramisu::var L1);
    void tag_gpu_level(tiramisu::var L0, tiramisu::var L1, tiramisu::var L2, tiramisu::var L3);
    void tag_gpu_level(tiramisu::var L0, tiramisu::var L1, tiramisu::var L2, tiramisu::var L3, tiramisu::var L4, tiramisu::var L5);
    // @}

    /**
      * Tag the loop level \p L to be parallelized.
      */
    void tag_parallel_level(tiramisu::var L);

    /**
      * Identical to
      *    void tag_parallel_level(int L);
      */
    void tag_parallel_level(int L);

    /**
      * Tag the loop level \p L to be vectorized.
      * \p len is the vector length.
      *
      * The user can only tag loop levels that have constant extent.
      * If a loop level does not have a constant extent, the user
      * should call .vectorize() command instead or he can call
      * separate() and split() manually.
      *
      * The user has to make sure that the extent of the dimension
      * is bigger than \p len. The vectorization of a loop that has
      * less than \p len iterations is not correct.
      *
      */
    void tag_vector_level(tiramisu::var L, int len);

    /**
      * Identical to
      *     void tag_vector_level(tiramisu::var L, int len);
      */
    void tag_vector_level(int L, int len);

    /**
      * Tag the loop level \p L to be distributed.
      */
    // @{
    void tag_distribute_level(tiramisu::var L);
    void tag_distribute_level(int L);
    // @}

    /**
      * Tag the loop level \p L to be unrolled.
      *
      * The user can only tag loop levels that have constant extent.
      */
    void tag_unroll_level(tiramisu::var L);

    /**
      * Tag the loop level \p L to be unrolled with an unrolling
      * factor \p F.
      *
      * The user can only tag loop levels that have constant extent
      * equal to \p F.
      */
    void tag_unroll_level(tiramisu::var L, int F);

    /**
     * Identical to
     *     void tag_unroll_level(tiramisu::var L);
     */
    void tag_unroll_level(int L);

    /**
     * Identical to
     *     void tag_unroll_level(tiramisu::var L, int F);
     */
    void tag_unroll_level(int L, int F);

    /**
      * \brief Schedule this computation to run before the computation \p next_computation
      * at the loop level \p L and return \p next_computation.
      *
      * \details Notes
      *     - This method is a simple wrapper around computation::after to help schedule
      *       chaining as in:
      *       \code
      *       C1.then(C2, j)
      *         .then(C3, computation::root)
      *         .then(C4, i)
      *         .then(C5, j);
      *       \endcode
      *     - The loop level \p L is a loop level of this computation.
      *     - Use computation::root to indicate the root dimension
      *       (i.e. the outermost time-space dimension).
      *     - Calling this method with the same computations overwrites the level if it is
      *     higher.
      *     - A computation being scheduled after another computation at level L means it is
      *     scheduled after that computation at all levels lower than L.
      */
    // @{
    computation &then(computation &next_computation, var L=computation::root);
    // @}

    /**
      * \overload
      */
    computation &then(computation &next_computation, int L=computation::root_dimension);

    /**
      * Tile the two loop levels \p L0 and \p L1 with rectangular
      * tiling. \p sizeX and \p sizeY represent the tile size.
      * \p L0 and \p L1 should be two consecutive loop levels.
      * \p L0_outer, \p L1_outer, \p L0_inner, \p L1_inner
      * are the names of the new dimensions created after tiling.
      */
    // @{
    virtual void tile(var L0, var L1, int sizeX, int sizeY);
    virtual void tile(var L0, var L1, int sizeX, int sizeY,
                      var L0_outer, var L1_outer, var L0_inner, var L1_inner);
    virtual void tile(var L0, var L1, var L2, int sizeX, int sizeY, int sizeZ);
    virtual void tile(var L0, var L1, var L2, int sizeX, int sizeY, int sizeZ,
                      var L0_outer, var L1_outer, var L2_outer, var L0_inner,
                      var L1_inner, var L2_inner);
    // @}

    /**
      * Tile the two loop levels \p L0 and \p L1 with rectangular
      * tiling. \p sizeX and \p sizeY represent the tile size.
      * \p L0 and \p L1 should be two consecutive loop levels
      * (i.e., \p L0 = \p L1 + 1) and they should satisfy
      * \p L0 > \p L1.
      */
    // @{
    virtual void tile(int L0, int L1, int sizeX, int sizeY);
    virtual void tile(int L0, int L1, int L2, int sizeX, int sizeY, int sizeZ);
    // @}

    /**
      * Unroll the loop level \p L with an unrolling factor \p fac.
      *
      * The difference between this function and the function
      * tag_unroll_level() is that this function separates
      * the iteration domain into full and partial iteration
      * domains for unrolling first and then it calls
      * tag_unroll_level().
      * tag_unroll_level() only tags a dimension to
      * be unrolled, it does not modify the tagged dimension.
      *
      * This function separates the iteration domain into two iteration
      * domains, a full iteration domain and a partial iteration domain.
      * The full iteration domain has an upper bound that is multiple of
      * \p fac while the other does not.
      * The full iteration domain is then split by \p fac and the inner loop
      * (which should have a constant extent equal to \p fac) is tagged as
      * a unrolled loop.
      *
      * Let us assume the following loop (a loop represents an iteration
      * domain)
      *
      * \code
      * for (i=0; i<N; i++)
      *   for (j=0; j<23; j++)
      *     S0;
      * \endcode
      *
      * To unroll the j loop with an unrolling factor of 4, one should call
      *
      *      S0.unroll(j, 4);
      *
      * The loop (iteration domain) is first separated into the following
      * two loops
      *
      * \code
      * for (int i=0; i<20; i++)
      *   S0;
      *
      * for (int i=20; i<23; i++)
      *   S0;
      * \endcode
      *
      * The full loop is then split by 4
      *
      * \code
      * for (int i1=0; i1<20/4; i1++)
      *   for (int i2=0; i2<4; i2++)
      *      S0;
      *
      * for (int i=20; i<23; i++)
      *   S0;
      * \endcode
      *
      * the i2 loop is then tagged to be unrolled.
      *
      * \p L_outer and \p L_inner are the names of the new loops created
      * after splitting. If not provided, default names will be assigned.
      * \p L_outer is the outer loop.
      *
      */
    //@{
    virtual void unroll(var L, int fac);
    virtual void unroll(var L, int fac, var L_outer, var L_inner);
    virtual void unroll(int L, int fac);
    //@}

    /**
      * Vectorize the loop level \p L.  Use the vector length \p v.
      *
      * The difference between this function and the function
      * tag_vector_level() is that this function
      * prepares the iteration domain for vectorization first
      * and then it calls tag_vector_level().
      * tag_vector_level() only tags a dimension to
      * be vectorized, it does not change the tagged dimension.
      *
      * This function will separate the iteration domain into two iteration
      * domains, a full iteration domain and a partial iteration domain.
      * The full iteration domain has an upper bound that is multiple of
      * \p v while the other does not.
      * The full iteration domain is then split by \p v and the inner loop
      * (which should have a constant extent equal to \p v) is tagged as
      * a vector loop.
      *
      * Let us assume the following loop (a loop represents an iteration
      * domain)
      *
      * \code
      * for (i=0; i<N; i++)
      *   for (j=0; j<23; j++)
      *     S0;
      * \endcode
      *
      * To vectorize the j loop with a vector length 4, one should call
      *
      *      S0.vectorize(j, 4);
      *
      * The loop (iteration domain) is first separated into the following
      * two loops
      *
      * \code
      * for (int i=0; i<20; i++)
      *   S0;
      *
      * for (int i=20; i<23; i++)
      *   S0;
      * \endcode
      *
      * The full loop is then split by 4
      *
      * \code
      * for (int i1=0; i1<20/4; i1++)
      *   for (int i2=0; i2<4; i2++)
      *      S0;
      *
      * for (int i=20; i<23; i++)
      *   S0;
      * \endcode
      *
      * the i2 loop is then tagged to be vectorized.
      *
      * The user has to make sure that the extent of the dimension
      * is bigger than \p v. The vectorization of a loop that has
      * less than \p v iterations is not correct.
      *
      * The names of the new loop iterators created after vectorization
      * are \p L_outer and \p L_inner.  If not provided, default names
      * are assigned.
      */
    // @{
    virtual void vectorize(var L, int v);
    virtual void vectorize(var L, int v, var L_outer, var L_inner);
    // @}

    /**
      * \brief Generate communication code for this computation
      *
      * Compute the sets that needs to be exchanged by the ranks, generate the corresponding
      * xfers, schedule the send, receive at root level if no computation was scheduled before,
      * map the received data to correct locations and allocate the required extra memory.
      *
      */
    void gen_communication();

    /**
      * Same as gen_communication(), but schedules send/recv at level l.
      *
      */
    void gen_communication(tiramisu::var l);

    /**
      * root_dimension is a number used to specify the dimension level
      * known as root.
      * The root dimension level is the outermost level.  It is the level
      * outside any loop nest.  Loop level 0 is the level of the first loop
      * (outermost loop), loop 1 is the level of following inner loop, ...
      *
      * Where is this number used ?
      *
      * These numbers are used in the helper functions used for scheduling
      * (such as after(), before(), ...).
      * For example, c0.after(c1) indicates that the computation c0 should
      * be executed after the computation c1.
      * Since the two computations c0 and c1 are usually nested in a loop,
      * we need to specify at which loop level c0 is after c1. This is where
      * we need to specify the loop level numbers.
      * Here is an example.  Suppose that the two computations c0 and c1
      * have the following iteration domains
      * {c0[i,j]: 0<=i<N and 0<=j<N} and {c1[i,j]: 0<=i<N and 0<=j<N}.
      *
      * When code is generated for the two computations, two loop nests
      * are generated.  When scheduling c0 after c1 using the after function,
      * the user can choose one among three possibilities in specifying at
      * which level c0 is after c1.
      *
      * - c0.after(c1, computation::root_dimension) would create a schedule
      * that generates the following code
      *
      * \code
      * for (i=0; i<N; i++)
      *     for (j=0; j<N; j++)
      *         c1;
      * for (i=0; i<N; i++)
      *     for (j=0; j<N; j++)
      *         c0;
      * \endcode
      *
      * - c0.after(c1, 0) would create a schedule that generates the
      * following code
      *
      * \code
      * for (i=0; i<N; i++) {
      *     for (j=0; j<N; j++)
      *         c1;
      *     for (j=0; j<N; j++)
      *         c0;
      * }
      * \endcode
      *
      * This means that c0 is after c1 starting from loop level 0,
      * (before the loop level 0, c0 and c1 have the same order).
      *
      * - c0.after(c1, 1) would create a schedule that generates the
      * following code
      *
      * \code
      * for (i=0; i<N; i++)
      *     for (j=0; j<N; j++) {
      *         c1;
      *         c0;
      *     }
      * \endcode
      *
      * This means that c0 is after c1 starting from loop level 1,
      * (before the loop level 1, c0 and c1 have the same order).
      */
    const static var root;

    /**
      * Equivalent of computation::root but to be used with scheduling
      * functions that take loop level (integers) as input instead of
      * tiramisu::var.
      */
    const static int root_dimension = -1;

    /**
      * Access operator: C0(i,j) represents an access to
      * the element (i,j) of the computation C0, i.e. the value
      * computed by the computation C0 at (i,j).
      *
      * If this computation is inline, the access indices are substituted
      * directly into the computation's expression and the resulting
      * expression is returned.  Otherwise, an o_access expression that
      * refers to this computation is returned.
      *
      * Exits with an error message if the number of access dimensions
      * does not match the number of dimensions used when the computation
      * was declared.
      */
    template<typename... Args> tiramisu::expr operator()(Args... args)
    {
        // TODO move to cpp
        std::vector<tiramisu::expr> access_expressions{std::forward<Args>(args)...};
        if (access_expressions.size() != this->number_of_dims)
        {
            tiramisu::str_dump("Error - Incorrect access: " + this->get_name() + "(");
            // size_t index avoids a signed/unsigned comparison with size().
            for (size_t i = 0; i < access_expressions.size(); i++)
            {
                tiramisu::expr e = access_expressions[i];
                e.dump(false);
                if (i != access_expressions.size() - 1)
                    tiramisu::str_dump(", ");
            }
            tiramisu::str_dump(").\n");
            tiramisu::str_dump("The number of access dimensions does not match that used in the declaration of " + this->get_name() + ".\n\n");
            exit(1);
        }

        if (this->is_inline_computation()) {
            std::vector<std::pair<var, expr>> substitutions;
            substitutions.reserve(this->access_variables.size());
            for (auto const &variable: this->access_variables) {
                // variable is an (index, variable_name) pair
                substitutions.push_back(std::make_pair(var(variable.second, false),
                                                       access_expressions[variable.first]));
            }
            // TODO add iteration space for expression
            return this->expression.substitute(substitutions);
        } else {
            return tiramisu::expr(tiramisu::o_access,
                                  this->get_name(),
                                  access_expressions,
                                  this->get_data_type());
        }
    }

    operator expr();

    static xfer create_xfer(std::string send_iter_domain, std::string recv_iter_domain, tiramisu::expr send_dest,
                            tiramisu::expr recv_src, xfer_prop send_prop, xfer_prop recv_prop,
                            tiramisu::expr send_expr, tiramisu::function *fct);

    static xfer create_xfer(std::string iter_domain, xfer_prop prop, tiramisu::expr expr,
                            tiramisu::function *fct);
};

class input: public computation
{
private:

    /**
      * \brief Compute a vector of loop iterators from a vector of sizes.
      *
      * \details If \p dimension_sizes is {10, 20}, this function returns a
      * vector holding the iterators var("random_name_0", 0, 10) and
      * var("random_name_1", 0, 20), where the names come from
      * \p dimension_names.
      */
    static std::vector<var>
    compute_iterators_from_sizes(std::vector<std::string> dimension_names,
            std::vector<tiramisu::expr> dimension_sizes)
    {
        assert(dimension_sizes.size() != 0);
        // Each dimension needs a matching name; a mismatch would read
        // dimension_names out of bounds below.
        assert(dimension_names.size() == dimension_sizes.size());

        std::vector<var> iterator_variables;
        iterator_variables.reserve(dimension_sizes.size());

        for (size_t i = 0; i < dimension_sizes.size(); i++)
        {
            // Construct the var in place. The previous implementation
            // allocated each var with `new`, pushed a copy and never
            // freed the allocation, leaking one var per dimension.
            iterator_variables.emplace_back(dimension_names[i], 0, dimension_sizes[i]);
        }

        return iterator_variables;
    }

public:
    /**
      * \brief Constructor for an input.
      *
      * \details
      *
      * Declare an input.
      *
      * \p name is the name of the input.
      *
      * \p iterator_variables is a vector that represents the dimensions of
      * the input.  It is used to define the size of the input.
      *
      * \p t is the type of the input elements.
      * Example of types include (p_uint8, p_uint16, p_uint32, ...).
      * Types are defined in \ref tiramisu::primitive_t
      *
      * Example:
      *
      * To declare a buffer buf[20, 30] where the buffer elements
      * are of type uint8.  We can first declare two iterator variables
      *
      * \code
      * var i("i", 0, 20), j("j", 0, 30);
      * \endcode
      *
      * and then we can declare the following input
      *
      * \code
      * input A("A", {i,j}, p_uint8);
      * \endcode
      *
      * Later in the code (in Layer III), we need to actually declare
      * the buffer and map this input to that buffer.
      *
      * An example is provided in tutorial 02.
      *
     */
    input(std::string name, std::vector<var> iterator_variables, primitive_t t):
        computation(name, iterator_variables, expr(t), false)
    {
    }

    /**
      * \overload
      * Same as above but with a freshly generated name.
      */
    input(std::vector<var> iterator_variables, primitive_t t):
        input(generate_new_computation_name(), iterator_variables, t)
    {
    }

    /**
      * \brief Constructor for an input.
      *
      * \details
      *
      * Declare an input.
      *
      * \p name is the name of the input.
      *
      * \p dimension_names is a vector that represents the names of the input
      * dimensions.
      *
      * \p dimension_sizes is a vector that represents the sizes of the input
      * dimensions.  It must have the same length as \p dimension_names.
      *
      * \p t is the type of the input elements.
      * Example of types include (p_uint8, p_uint16, p_uint32, ...).
      * Types are defined in \ref tiramisu::primitive_t
      *
      * Example:
      *
      * To declare a buffer buf[20, 30] where the buffer elements
      * are of type uint8 and where the first dimension is called
      * "i" and the second dimension is "j".
      *
      * \code
      * input A("A", {"i", "j"}, {20, 30}, p_uint8);
      * \endcode
      *
      * Later in the code (in Layer III), we need to actually declare
      * the buffer and map this input to that buffer.
      *
      * An example is provided in tutorial 02.
      *
     */
    input(std::string name, std::vector<std::string> dimension_names,
            std::vector<tiramisu::expr> dimension_sizes, primitive_t t):
        computation(name, compute_iterators_from_sizes(dimension_names, dimension_sizes), expr(t), false)
    {
    }

    // ADD:FLEXNLP
    /**
      * \brief Constructor for an input.
      *
      * \details
      *
      * Declare an input.
      *
      * \p name is the name of the input.
      *
      * \p iterator_variables is a vector that represents the dimensions of
      * the input.  It is used to define the size of the input.
      *
      * \p t is the type of the input elements.
      * Example of types include (p_uint8, p_uint16, p_uint32, ...).
      * Types are defined in \ref tiramisu::primitive_t
      *
      * \p cpu_buffer_to_map_to_device is the buffer that is mapped to this input
      * in the device.
      *
      * Example:
      *
      * To declare a buffer buf[20, 30] where the buffer elements
      * are of type float32, that is the mapping of the cpu_A buffer.
      * We can first declare two iterator variables
      *
      * \code
      * var i("i", 0, 20), j("j", 0, 30);
      * \endcode
      *
      * and then we can declare the following buffer
      *
      * \code
      * buffer cpu_A("cpu_A", {10,30}, p_float32, a_input);
      * \endcode
      *
      * and then create the corresponding device input
      *
      * \code
      * input A("A", {i,j}, p_float32, cpu_A);
      * A.get_buffer()->tag_gpu_global()
      * \endcode
      *
      * This declaration's advantage, is that the buffer cpu_A
      * will get automatically copied to A's buffer which will be
      * automatically allocated.
      *
     */
    input(std::string name, std::vector<var> iterator_variables, primitive_t t, tiramisu::buffer cpu_buffer_to_map_to_device):
      computation(name, iterator_variables, expr(), expr(t), false, t, cpu_buffer_to_map_to_device)
    {
    }

    /**
      * \overload
      * Same as above but with a freshly generated name.
      */
    input(std::vector<var> iterator_variables, primitive_t t, tiramisu::buffer cpu_buffer_to_map_to_device):
        input(generate_new_computation_name(), iterator_variables, t, cpu_buffer_to_map_to_device)
    {
    }
};

class Input: public input{

public:
    std::vector<var> iterators_from_size_expressions(std::vector<expr> sizes)
    {
	std::vector<var> iterator_variables;

	for (int s = 0; s < sizes.size(); s++)
	{
	    var v(generate_new_computation_name(), 0, sizes[s]);
	    iterator_variables.push_back(v);
	}

	return iterator_variables;
    }

    /**
      * \overload
      */
    Input(std::string name, std::vector<expr> sizes, primitive_t t)
	:input(name, iterators_from_size_expressions(sizes), t)
    {
    }
};

class view : public computation
{
public:
    /**
      * \brief Constructor for a view.
      *
      * \details
      *
      * A view gives a named, multi-dimensional handle over data that is
      * stored elsewhere (typically a buffer written by another
      * computation).
      *
      * \p name is the name of the view.
      *
      * \p iterator_variables lists the dimensions of the view and thereby
      * fixes its size.
      *
      * \p t is the type of the underlying buffer elements
      * (p_float32 for example).
      *
      * Example: to read a buffer buf holding the results of a computation
      * C, declare iterators
      *
      * \code
      * var i("i", 0, 20), j("j", 0, 30);
      * \endcode
      *
      * then declare the view
      *
      * \code
      * view V("V", {i,j}, p_float64);
      * \endcode
      *
      * and finally, in Layer III, map the view to the buffer:
      *
      * \code
      * V.store_in(&buf);
      * \endcode
      */
    view(std::string name, std::vector<var> iterator_variables, primitive_t t)
        : computation(name, iterator_variables, expr(), false, t)
    {
    }
};




/**
  * A class that represents loop invariants.
  *
  * An object of the invariant class can be an expression, a symbolic constant
  * or a variable that is invariant to all the loops of the function.
  */
class constant: public computation
{
private:
    /**
      * If this constant is not function wide, i.e., if it is computed
      * with a computation, then \p compute_with_computation is a pointer on the
      * computation with whom this constant should be computed.
      * We need to know this because we need to translate the accesses
      * used within the RHS of this constant expression to the new scheduled
      * accesses. Since a computation does not have a buffer access (it is
      * translated into a let statement), it also does not have an iterator
      * map, thus it cannot translate its accesses to scheduled accesses.
      * Thus we use the iterator map of the computation with whom we
      * compute this constant and take its iterator map.
      */
    tiramisu::computation *compute_with_computation;

public:

    /**
      * Create a constant where \p param_name is the name of
      * the constant that will hold the value of the constant.
      *
      * \p param_expr is the expression that defines the value
      * of the constant.
      *
      * \p t indicates the type of the constant.
      *
      * \p function_wide should be set to true if the constant is
      * defined at the entry of the function and is visible to all
      * the computations.
      * If function_wide is set to true, then the constant is an
      * invariant to the whole function where it is declared.
      *
      * \p with_computation should be set only if function_wide
      * is false, i.e. if the constant is not function wide.
      * In such a case the user should indicate where the
      * constant should be assigned.
      * \p with_computation indicates that the assignment should
      * be in the loop nest that computes the computation indicated by
      * \p with_computation at the loop level indicated
      * by \p at_loop_level.
      * The root level (i.e. the level outer than any other loop level)
      * is computation::root_dimension.
      * 0 represents the first loop level and 1 represents the second
      * loop level, ...
      *
      * \p func is the function in which the constant is defined. If this
      * argument is not provided, the implicit Tiramisu function is used
      * (i.e., the function created during Tiramisu initialization).
      */
    constant(std::string param_name, const tiramisu::expr &param_expr,
             tiramisu::primitive_t t,
             bool function_wide,
             tiramisu::computation *with_computation,
             int at_loop_level,
             tiramisu::function *func = global::get_implicit_function());

    /**
      * This constructor has the same behaviour as
      * \code
         constant(std::string param_name, const tiramisu::expr &param_expr,
             tiramisu::primitive_t t,
             bool function_wide,
             tiramisu::computation *with_computation,
             int at_loop_level,
             tiramisu::function *func);
	\endcode
      * except that it has less arguments.  The resulting constant is
      * function wide (not attached to any computation).
      */
    constant(std::string param_name, const tiramisu::expr &param_expr,
             tiramisu::primitive_t t,
             tiramisu::function *func = global::get_implicit_function());

    /**
      * \brief Create a constant.
      *
      * \details
      *
      * Define a constant (scalar) at the entry of the function and make it visible to all
      * the computations.
      *
      * \p param_name is the name of the constant.
      * \p param_expr is the expression that defines the value of the constant.
      *
      */
    constant(std::string param_name, const tiramisu::expr &param_expr);

    /**
      * Get the computation with whom this constant is computed.
      * Returns nullptr semantics are not visible here; presumably null
      * for function-wide constants -- TODO confirm in core.cpp.
      */
    tiramisu::computation *get_computation_with_whom_this_is_computed()
    {
        return compute_with_computation;
    }

    /**
      * Dump the invariant on standard output (dump most of the fields of
      * the invariant class).
      * This is mainly useful for debugging.
      * If \p exhaustive is set to true, all the fields of the invariant
      * class are printed.  This is useful to find potential initialization
      * problems.
      */
    void dump(bool exhaustive) const;

    /**
      * Implicit conversion of the constant into an expression, so that
      * a constant can be used directly inside other expressions.
      */
    operator expr();
};


/**
* A class for code generation.
*
* All members are static helpers used while lowering a scheduled
* Tiramisu function (ISL AST) into Halide IR.
*/
class generator
{
    friend function;
    friend computation;
    friend buffer;
    friend cuda_ast::generator;

protected:

    /**
     * Compute the iterators map.
     * The iterators map is a map between the original names of the iterators of a computation
     * and their transformed form after schedule (also after renaming).
     *
     * If in the original computation, we had
     *
     * {C[i0, i1]: ...}
     *
     * And if in the generated code, the iterators are called c0, c1, c2 and c3 and
     * the loops are tiled, then the map will be
     *
     * {<i0, c0*10+c2>, <i1, c1*10+c3>}.
     *
     **/
    static std::map<std::string, isl_ast_expr *>
        compute_iterators_map(tiramisu::computation *comp, isl_ast_build *build);

   /**
     * Get the computation associated with a node.
     */
    static std::vector<tiramisu::computation *>
        get_computation_by_node(tiramisu::function *fct, isl_ast_node *node);

    /**
     * Traverse the vector of computations \p comp_vec and return the computations
     * that have a domain that intersects with \p node_domain.
     */
    static std::vector<tiramisu::computation *> filter_computations_by_domain(std::vector<tiramisu::computation *> comp_vec,
            isl_union_set *node_domain);

    /**
     * Compute the accesses of the RHS of the computation
     * \p comp and store them in the \p accesses vector.
     *
     * If \p return_buffer_accesses is set to true, this function returns access functions to
     * buffers. Otherwise it returns access functions to computations.
     */
    static void get_rhs_accesses(const tiramisu::function *func, const tiramisu::computation *comp,
                          std::vector<isl_map *> &accesses, bool return_buffer_accesses);

    /**
     * Analyze the \p access_expression and return a set of constraints
     * that correspond to the access pattern of the access_expression.
     *
     * access_dimension:
     *      The dimension of the access. For example, the access
     *      C0(i0, i1, i2) have three access dimensions: i0, i1 and i2.
     * access_expression:
     *      The expression of the access.
     *      This expression is parsed recursively (by calling get_constraint_for_access)
     *      and is gradually used to update the constraint.
     * access_relation:
     *      The access relation that represents the access.
     * cst:
     *      The constraint that defines the access and that is being constructed.
     *      Different calls to get_constraint_for_access modify this constraint
     *      gradually until the final constraint is created. Only the final constraint
     *      is added to the access_relation.
     * coeff:
     *      The coefficient in which all the dimension coefficients of the constraint
     *      are going to be multiplied. This coefficient is used to implement o_minus,
     *      o_mul and o_sub.
     */
    static isl_constraint *get_constraint_for_access(int access_dimension,
                                                     const tiramisu::expr &access_expression,
                                                     isl_map *access_relation,
                                                     isl_constraint *cst,
                                                     int coeff,
                                                     const tiramisu::function *fct);

    /**
     * Extract tags from the ISL ast node at given level. This is a helper
     * function meant to be used from halide_stmt_from_isl_node. Traverses
     * the ISL ast tree and fills the tagged_stmts vector.
     */
    static void extract_tags_from_isl_node(const tiramisu::function &fct, isl_ast_node *node, int level,
                                           std::vector<std::pair<std::string, std::string>> &tagged_stmts);

    /**
      * Generate a Halide statement from an ISL ast node object in the ISL ast
      * tree.
      * Level represents the level of the node in the schedule. 0 means root.
      * It takes as input:
      *     - a function \p fct for which we are generating code,
      *     - a \p node,
      *     - \p level represents the current loop level being traversed (0 means the outer level),
      *     - \p is_a_child_block indicates whether the block that is going to be
      *     generated is a child block for another block. In such a case, allocate
      *     and let statements should not be generated. Allocate and let statements
      *     should only be generated in non-child blocks so that their scope reaches
      *     the whole block.
      */
    static Halide::Internal::Stmt halide_stmt_from_isl_node(const tiramisu::function &fct, isl_ast_node *node,
                                                            int level,
                                                            std::vector<std::pair<std::string, std::string>> &tagged_stmts,
                                                            bool is_a_child_block);

    /**
     * Combine \p first and \p second into a single Halide block statement.
     */
    static Halide::Internal::Stmt make_halide_block(const Halide::Internal::Stmt &first,
            const Halide::Internal::Stmt &second);

    /**
     * Wrap \p stmt with an allocation of buffer \p b with the given
     * \p extents.  make_buffer_free produces the matching free statement.
     */
    static Halide::Internal::Stmt make_buffer_alloc(buffer *b, const std::vector<Halide::Expr> &extents,
                                                    Halide::Internal::Stmt &stmt);
    static Halide::Internal::Stmt make_buffer_free(buffer *b);

    /**
     * Create a Halide expression from a Tiramisu expression.
     */
    static Halide::Expr halide_expr_from_tiramisu_expr(const tiramisu::function *fct,
            std::vector<isl_ast_expr *> &index_expr,
            const tiramisu::expr &tiramisu_expr, tiramisu::computation *comp = nullptr);

    /**
     * Replace the accesses in \p tiramisu_expr using the index
     * expressions in \p index_expr and return the resulting expression.
     */
    static tiramisu::expr replace_accesses(const tiramisu::function * func, std::vector<isl_ast_expr *> &index_expr,
                                           const tiramisu::expr &tiramisu_expr);
    /** Return the name of the buffer associated with the computation. */
    static std::string get_buffer_name(const tiramisu::computation *);
    /** Translate a computation access into the corresponding buffer access. */
    static tiramisu::expr comp_to_buffer(tiramisu::computation * comp, std::vector<isl_ast_expr *> &ie,
                                                    const tiramisu::expr * expr = nullptr);

    /**
     * Linearize a multidimensional access to a Halide buffer.
     * Supposing that we have buf[N1][N2][N3], transform buf[i][j][k]
     * into buf[k + j*N3 + i*N3*N2].
     * Note that the first arg in index_expr is the buffer name.  The other args
     * are the indices for each dimension of the buffer.
     */
    //@{
    static Halide::Expr linearize_access(int dims, const halide_dimension_t *shape, isl_ast_expr *index_expr);
    static Halide::Expr linearize_access(int dims, const halide_dimension_t *shape, std::vector<tiramisu::expr> index_expr);
    static Halide::Expr linearize_access(int dims, std::vector<Halide::Expr> &strides, std::vector<tiramisu::expr> index_expr);
    static Halide::Expr linearize_access(int dims, std::vector<Halide::Expr> &strides, isl_ast_expr *index_expr);
    static tiramisu::expr linearize_access(int dims, std::vector<tiramisu::expr> &strides, std::vector<tiramisu::expr> index_expr);
    static tiramisu::expr linearize_access(int dims, std::vector<tiramisu::expr> &strides, isl_ast_expr *index_expr);
    //@}

    /**
     * Retrieve the access function of the ISL AST leaf node (which represents a
     * computation). Store the access in computation->access.
     */
    static isl_ast_node *stmt_code_generator(isl_ast_node *node, isl_ast_build *build, void *user);

    /**
     * Traverse a tiramisu expression (\p exp) and extract the access relations
     * from the access operation passed in \p exp.  The access relations are added
     * to the vector \p accesses.
     * The access relation is from the domain of the computation \p comp to the
     * domain of the computation accessed by the access operation.
     * If \p return_buffer_accesses = true, an access to a buffer is created
     * instead of an access to computations.
     */
    static void traverse_expr_and_extract_accesses(const tiramisu::function *fct,
                                            const tiramisu::computation *comp,
                                            const tiramisu::expr &exp,
                                            std::vector<isl_map *> &accesses,
                                            bool return_buffer_accesses);

    /**
     * Traverse a tiramisu expression (\p current_exp) until an expression with the specified name is found.
     * Replace that name with a new name. Replaces all occurrences.
     */
    static void _update_producer_expr_name(tiramisu::expr &current_exp, std::string name_to_replace,
                                           std::string replace_with);

public:

    /**
     * Traverse the expression of computation \p comp until an expression with the specified name is found.
     * Replace that name with a new name. Replaces all occurrences.
     */
    static void update_producer_expr_name(tiramisu::computation *comp, std::string name_to_replace,
                                          std::string replace_with);
};

/**
 * A class containing utility functions.
 */
class utility
{
public:
    /**
     * Traverse recursively the ISL AST tree
     *
     * \p ast represents the root of the tree to be traversed.
     *
     * \p dim is the dimension of the loop from which the bounds have to be
     * extracted.
     *
     * \p upper is a boolean that should be set to true to extract
     * the upper bound and false to extract the lower bound.
     */
     static expr extract_bound_expression(isl_ast_node *ast, int dim, bool upper);

    /**
     * Return a tiramisu::expr representing the bound of
     * the dimension \p dim in \p set.  If \p upper is true
     * then this function returns the upper bound otherwise
     * it returns the lower bound.
     * (Note: \p upper is declared as int but used as a boolean flag.)
     *
     * For example, assuming that
     *
     * S = {S[i,j]: 0<=i<N and 0<=j<N and i<M}
     *
     * then
     *
     * get_upper_bound(S, 1)
     *
     * would return N-1, while
     *
     * get_upper_bound(S, 0)
     *
     * would return min(N-1,M-1)
     */
    static tiramisu::expr get_bound(isl_set *set, int dim, int upper);

    /**
     * Return the extent (number of iterations) of loop dimension \p dim.
     *
     * For example:
     *
     * [N]->{C[i,j]: 0 <= i < N and N = 10}
     *
     * then get_extent(C,0) would return 10.
     *
     */
    static int get_extent(isl_set *set, int dim);

    /**
     * Create a comma separated string that represents the list
     * of the parameters of \p set.
     *
     * For example, if the set is
     *
     * [N,M,K]->{S[i]}
     *
     * this function returns the string "N,M,K".
     */
    static std::string get_parameters_list(isl_set *set);
};

// TODO Jess: add doc comments

/**
  * A set of properties describing a data transfer: the element data
  * type plus a list of transfer attributes (tiramisu::xfer_attr).
  */
class xfer_prop {
    friend class communicator;
private:

    // Attributes qualifying the transfer (see tiramisu::xfer_attr in type.h).
    std::vector<tiramisu::xfer_attr> attrs;

    // Data type of the transferred elements.
    tiramisu::primitive_t dtype;

    // Numeric identifier of this property set; ids are tracked in the
    // static xfer_prop_ids set (presumably to keep them unique -- TODO confirm).
    int xfer_prop_id;

    xfer_prop();

public:

    /** Create a property set with element type \p d_type and the given attributes. */
    xfer_prop(tiramisu::primitive_t d_type, std::initializer_list<tiramisu::xfer_attr> attrs);

    /** \overload Also specify an explicit \p xfer_prop_id. */
    xfer_prop(tiramisu::primitive_t d_type, std::initializer_list<tiramisu::xfer_attr> attrs,
              int xfer_prop_id);

    // Ids in use across all xfer_prop instances.
    static std::set<int> xfer_prop_ids;

    /** Return a printable name for \p attr. */
    static std::string attr_to_string(xfer_attr attr);

    /** Return the data type of the transferred elements. */
    tiramisu::primitive_t get_dtype() const;

    /** Return the id of this property set. */
    int get_xfer_prop_id() const;

    /** Return true if \p attr is among this set's attributes. */
    bool contains_attr(tiramisu::xfer_attr attr) const;

    /** Return true if every attribute in \p attrs is in this set. */
    bool contains_attrs(std::vector<tiramisu::xfer_attr> attrs) const;

    /** Add \p attr to this set's attributes. */
    void add_attr(tiramisu::xfer_attr attr);

};

/**
  * Base class for computations that represent a data transfer
  * (send, recv, send_recv, wait).
  */
class communicator : public computation {
private:

    // Sizes of the transfer dimensions, appended via add_dim().
    std::vector<tiramisu::expr> dims;

protected:

    // Properties (element type + attributes) of this transfer.
    xfer_prop prop;

    communicator();

public:

    /**
      * Construct a communicator over the iteration domain
      * \p iteration_domain_str (ISL string format) computing \p rhs,
      * inside function \p fct.
      */
    communicator(std::string iteration_domain_str, tiramisu::expr rhs, bool schedule_this_computation,
                 tiramisu::primitive_t data_type, tiramisu::function *fct);

    /** \overload Also specify the transfer properties \p prop. */
    communicator(std::string iteration_domain_str, tiramisu::expr rhs,
                 bool schedule_this_computation, tiramisu::primitive_t, tiramisu::xfer_prop prop,
                 tiramisu::function *fct);

    /** Return the transfer properties of this communicator. */
    xfer_prop get_xfer_props() const;

    /** Return the number of transferred elements (presumably derived
      * from the sizes in dims -- TODO confirm in core.cpp). */
    tiramisu::expr get_num_elements() const;

    /** Append a transfer dimension of size \p size. */
    void add_dim(tiramisu::expr size);

    /**
      * Collapse a loop level.
      */
    std::vector<communicator *> collapse(int level, tiramisu::expr collapse_from_iter,
                                         tiramisu::expr collapse_until_iter, tiramisu::expr num_collapsed);

    /**
      * Collapse several consecutive loop levels (must collapse from innermost towards outermost).
      */
    void collapse_many(std::vector<collapse_group> collapse_each);

};

/**
  * A communicator representing the sending side of a data transfer.
  */
class send : public communicator {
private:

    // Computation producing the data being sent.
    // TODO (Jess) is producer ever used?
    tiramisu::computation *producer = nullptr;

    // The recv paired with this send, set via set_matching_recv().
    recv *matching_recv = nullptr;

    // Message tag identifying this transfer; can be overridden.
    tiramisu::expr msg_tag;

    // Source and destination of the transfer (presumably rank/process
    // identifiers -- TODO confirm).
    tiramisu::expr src;

    tiramisu::expr dest;

    // Counter used to assign fresh message tags.
    static int next_msg_tag;

public:
    // TODO (Jess) is producer ever used?
    send(std::string iteration_domain_str, tiramisu::computation *producer, tiramisu::expr rhs,
         xfer_prop prop, bool schedule_this_computation, std::vector<expr> dims,
         tiramisu::function *fct);

    /** Always true for this class (type test used by the code generator). */
    virtual bool is_send() const override;

    virtual void add_definitions(std::string iteration_domain_str, tiramisu::expr e,
                                 bool schedule_this_computation, tiramisu::primitive_t t,
                                 tiramisu::function *fct) override;

    /** Return the computation producing the sent data. */
    tiramisu::computation *get_producer() const;

    /** Return the recv paired with this send (nullptr until set). */
    tiramisu::recv *get_matching_recv() const;

    /** Return the message tag of this transfer. */
    tiramisu::expr get_msg_tag() const;

    /** Return the source of the transfer. */
    tiramisu::expr get_src() const;

    /** Return the destination of the transfer. */
    tiramisu::expr get_dest() const;

    /** Pair this send with \p matching_recv. */
    void set_matching_recv(tiramisu::recv *matching_recv);

    void set_src(tiramisu::expr src);

    void set_dest(tiramisu::expr dest);

    /** Replace the automatically assigned message tag with \p msg_tag. */
    void override_msg_tag(tiramisu::expr msg_tag);
};

/**
  * A communicator representing the receiving side of a data transfer.
  */
class recv : public communicator {
private:

    // The send paired with this recv, set via set_matching_send().
    send *matching_send = nullptr;

    // Source and destination of the transfer (presumably rank/process
    // identifiers -- TODO confirm).
    tiramisu::expr src;

    tiramisu::expr dest;

    // Message tag identifying this transfer; can be overridden.
    tiramisu::expr msg_tag;

public:

    recv(std::string iteration_domain_str, bool schedule_this, tiramisu::xfer_prop prop,
         tiramisu::function *fct);

    /** Always true for this class (type test used by the code generator). */
    virtual bool is_recv() const override;

    virtual void add_definitions(std::string iteration_domain_str, tiramisu::expr e,
                                 bool schedule_this_computation, tiramisu::primitive_t t,
                                 tiramisu::function *fct) override;

    /** Return the send paired with this recv (nullptr until set). */
    tiramisu::send *get_matching_send() const;

    /** Return the message tag of this transfer. */
    tiramisu::expr get_msg_tag() const;

    /** Return the source of the transfer. */
    tiramisu::expr get_src() const;

    /** Return the destination of the transfer. */
    tiramisu::expr get_dest() const;

    /** Pair this recv with \p matching_send. */
    void set_matching_send(tiramisu::send *matching_send);

    void set_src(tiramisu::expr src);

    void set_dest(tiramisu::expr dest);

    /** Replace the automatically assigned message tag with \p msg_tag. */
    void override_msg_tag(tiramisu::expr msg_tag);

};

/**
  * A communicator representing a combined send-and-receive transfer.
  */
class send_recv : public communicator {
public:

    send_recv(std::string iteration_domain_str, tiramisu::computation *producer,
                  tiramisu::expr rhs, xfer_prop prop, bool schedule_this_computation,
                  std::vector<expr> dims, tiramisu::function *fct);

    /** Always true for this class (type test used by the code generator). */
    virtual bool is_send_recv() const override;

};

/**
  * A communicator that waits for the completion of an asynchronous
  * transfer designated by \p rhs.
  */
class wait : public communicator {
private:

    // Expression designating the operation(s) to wait on.
    tiramisu::expr rhs;

public:

    wait(tiramisu::expr rhs, xfer_prop prop, tiramisu::function *fct);

    wait(std::string iteration_domain_str, tiramisu::expr rhs, xfer_prop prop, bool schedule_this,
         tiramisu::function *fct);

    /** Always true for this class (type test used by the code generator). */
    virtual bool is_wait() const override;

    virtual void add_definitions(std::string iteration_domain_str, tiramisu::expr e,
                                 bool schedule_this_computation, tiramisu::primitive_t t,
                                 tiramisu::function *fct) override;

    /** Return the computations whose completion this wait blocks on. */
    std::vector<tiramisu::computation *> get_op_to_wait_on() const;

};

// Halide IR specific functions

/** Dump the Halide statement \p s (debugging aid). */
void halide_stmt_dump(Halide::Internal::Stmt s);

/**
  * Wrap the Halide statement \p s into a Halide::Module for the
  * pipeline \p pipeline_name, targeting \p t with arguments \p args
  * and linkage \p linkage_type.
  */
Halide::Module lower_halide_pipeline(
    const std::string &pipeline_name,
    const Halide::Target &t,
    const std::vector<Halide::Argument> &args,
    const Halide::LinkageType linkage_type,
    Halide::Internal::Stmt s);

// Map a loop level to the corresponding dimension index in the schedule
// (presumably static and dynamic dimensions are interleaved in the
// time-processor space -- TODO confirm in core.cpp).
int loop_level_into_dynamic_dimension(int level);
int loop_level_into_static_dimension(int level);
/**
  * TODO code cleaning:
  * - Go to the tutorials, add a small explanation about how Tiramisu should work in general.
  * - Add two pages explaining how one should use Tiramisu,
  *
  * - Have documentation on header files only,
  * - Order the functions in the class computations (get functions then update functions ordered in alphabetical order),
  * - Clean/document expr.h and type.h
  */

}

#endif
